Skip to content

Commit d2ead2c

Browse files
committed
Merge pull request #7000 from jreback/groupby_counts_agg
ENH/BUG: add count to grouper / ensure that grouper keys are not included in the returned
2 parents 97c4a2e + 134dd1f commit d2ead2c

File tree

8 files changed

+189
-63
lines changed

8 files changed

+189
-63
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,8 @@ API Changes
179179
validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
180180
- Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
181181
``data`` argument (:issue:`5357`)
182+
- groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
183+
as it's already the index
182184

183185
Deprecations
184186
~~~~~~~~~~~~

doc/source/v0.14.0.txt

+18-1
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,29 @@ API changes
110110

111111
.. ipython:: python
112112

113-
DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
113+
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
114114
g = df.groupby('A')
115115
g.nth(0) # can also use negative ints
116116

117117
g.nth(0, dropna='any') # similar to old behaviour
118118

119+
groupby will now not return the grouped column for non-cython functions (:issue:`5610`),
120+
as it's already the index
121+
122+
.. ipython:: python
123+
124+
df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
125+
g = df.groupby('A')
126+
g.count()
127+
g.describe()
128+
129+
passing ``as_index=False`` will leave the grouped column in-place (this is not a change in 0.14.0)
130+
131+
df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
132+
g = df.groupby('A',as_index=False)
133+
g.count()
134+
g.describe()
135+
119136
- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
120137
by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
121138

pandas/core/categorical.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,13 @@ def describe(self):
226226
"""
227227
# Hack?
228228
from pandas.core.frame import DataFrame
229-
grouped = DataFrame(self.labels).groupby(0)
230-
counts = grouped.count().values.squeeze()
229+
counts = DataFrame({
230+
'labels' : self.labels,
231+
'values' : self.labels }
232+
).groupby('labels').count().squeeze().values
231233
freqs = counts / float(counts.sum())
232-
return DataFrame.from_dict({
234+
return DataFrame({
233235
'counts': counts,
234236
'freqs': freqs,
235237
'levels': self.levels
236-
}).set_index('levels')
238+
}).set_index('levels')

pandas/core/generic.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -611,11 +611,19 @@ def __neg__(self):
611611
arr = operator.inv(values)
612612
else:
613613
arr = operator.neg(values)
614-
return self._wrap_array(arr, self.axes, copy=False)
614+
return self.__array_wrap__(arr)
615615

616616
def __invert__(self):
617-
arr = operator.inv(_values_from_object(self))
618-
return self._wrap_array(arr, self.axes, copy=False)
617+
try:
618+
arr = operator.inv(_values_from_object(self))
619+
return self.__array_wrap__(arr)
620+
except:
621+
622+
# inv fails with 0 len
623+
if not np.prod(self.shape):
624+
return self
625+
626+
raise
619627

620628
def equals(self, other):
621629
"""
@@ -707,15 +715,11 @@ def __abs__(self):
707715
#----------------------------------------------------------------------
708716
# Array Interface
709717

710-
def _wrap_array(self, arr, axes, copy=False):
711-
d = self._construct_axes_dict_from(self, axes, copy=copy)
712-
return self._constructor(arr, **d).__finalize__(self)
713-
714718
def __array__(self, dtype=None):
715719
return _values_from_object(self)
716720

717-
def __array_wrap__(self, result):
718-
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
721+
def __array_wrap__(self, result, copy=False):
722+
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=copy)
719723
return self._constructor(result, **d).__finalize__(self)
720724

721725
# ideally we would define this to avoid the getattr checks, but

pandas/core/groupby.py

+78-29
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,23 @@ def _selection_list(self):
445445
return [self._selection]
446446
return self._selection
447447

448+
@cache_readonly
449+
def _selected_obj(self):
450+
451+
if self._selection is None or isinstance(self.obj, Series):
452+
return self.obj
453+
else:
454+
return self.obj[self._selection]
455+
456+
def _set_selection_from_grouper(self):
457+
""" we may need create a selection if we have non-level groupers """
458+
grp = self.grouper
459+
if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None:
460+
ax = self.obj._info_axis
461+
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
462+
if len(groupers):
463+
self._selection = (ax-Index(groupers)).tolist()
464+
448465
def _local_dir(self):
449466
return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
450467

@@ -453,7 +470,6 @@ def __getattr__(self, attr):
453470
return object.__getattribute__(self, attr)
454471
if attr in self.obj:
455472
return self[attr]
456-
457473
if hasattr(self.obj, attr):
458474
return self._make_wrapper(attr)
459475

@@ -472,6 +488,10 @@ def _make_wrapper(self, name):
472488
type(self).__name__))
473489
raise AttributeError(msg)
474490

491+
# need to setup the selection
492+
# as are not passed directly but in the grouper
493+
self._set_selection_from_grouper()
494+
475495
f = getattr(self._selected_obj, name)
476496
if not isinstance(f, types.MethodType):
477497
return self.apply(lambda self: getattr(self, name))
@@ -503,7 +523,19 @@ def curried(x):
503523
try:
504524
return self.apply(curried_with_axis)
505525
except Exception:
506-
return self.apply(curried)
526+
try:
527+
return self.apply(curried)
528+
except Exception:
529+
530+
# related to : GH3688
531+
# try item-by-item
532+
# this can be called recursively, so need to raise ValueError if
533+
# we don't have this method to indicate to aggregate to
534+
# mark this column as an error
535+
try:
536+
return self._aggregate_item_by_item(name, *args, **kwargs)
537+
except (AttributeError):
538+
raise ValueError
507539

508540
return wrapper
509541

@@ -624,6 +656,7 @@ def mean(self):
624656
except GroupByError:
625657
raise
626658
except Exception: # pragma: no cover
659+
self._set_selection_from_grouper()
627660
f = lambda x: x.mean(axis=self.axis)
628661
return self._python_agg_general(f)
629662

@@ -639,6 +672,7 @@ def median(self):
639672
raise
640673
except Exception: # pragma: no cover
641674

675+
self._set_selection_from_grouper()
642676
def f(x):
643677
if isinstance(x, np.ndarray):
644678
x = Series(x)
@@ -655,6 +689,7 @@ def std(self, ddof=1):
655689
if ddof == 1:
656690
return self._cython_agg_general('std')
657691
else:
692+
self._set_selection_from_grouper()
658693
f = lambda x: x.std(ddof=ddof)
659694
return self._python_agg_general(f)
660695

@@ -667,15 +702,26 @@ def var(self, ddof=1):
667702
if ddof == 1:
668703
return self._cython_agg_general('var')
669704
else:
705+
self._set_selection_from_grouper()
670706
f = lambda x: x.var(ddof=ddof)
671707
return self._python_agg_general(f)
672708

673709
def size(self):
674710
"""
675711
Compute group sizes
712+
676713
"""
677714
return self.grouper.size()
678715

716+
def count(self, axis=0):
717+
"""
718+
Number of non-null items in each group.
719+
axis : axis number, default 0
720+
the grouping axis
721+
"""
722+
self._set_selection_from_grouper()
723+
return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
724+
679725
sum = _groupby_function('sum', 'add', np.sum)
680726
prod = _groupby_function('prod', 'prod', np.prod)
681727
min = _groupby_function('min', 'min', np.min, numeric_only=False)
@@ -685,14 +731,14 @@ def size(self):
685731
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
686732
_convert=True)
687733

734+
688735
def ohlc(self):
689736
"""
690737
Compute sum of values, excluding missing values
691-
692738
For multiple groupings, the result index will be a MultiIndex
693-
694739
"""
695-
return self._cython_agg_general('ohlc')
740+
return self._apply_to_column_groupbys(
741+
lambda x: x._cython_agg_general('ohlc'))
696742

697743
def nth(self, n, dropna=None):
698744
"""
@@ -888,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs):
888934
cumcounts[v] = arr[len(v)-1::-1]
889935
return cumcounts
890936

891-
@cache_readonly
892-
def _selected_obj(self):
893-
if self._selection is None or isinstance(self.obj, Series):
894-
return self.obj
895-
else:
896-
return self.obj[self._selection]
897-
898937
def _index_with_as_index(self, b):
899938
"""
900939
Take boolean mask of index to be returned from apply, if as_index=True
@@ -990,12 +1029,23 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
9901029
result = result.reindex(ax)
9911030
else:
9921031
result = result.reindex_axis(ax, axis=self.axis)
993-
elif self.group_keys and self.as_index:
994-
group_keys = keys
995-
group_levels = self.grouper.levels
996-
group_names = self.grouper.names
997-
result = concat(values, axis=self.axis, keys=group_keys,
998-
levels=group_levels, names=group_names)
1032+
1033+
elif self.group_keys:
1034+
1035+
if self.as_index:
1036+
1037+
# possible MI return case
1038+
group_keys = keys
1039+
group_levels = self.grouper.levels
1040+
group_names = self.grouper.names
1041+
result = concat(values, axis=self.axis, keys=group_keys,
1042+
levels=group_levels, names=group_names)
1043+
else:
1044+
1045+
# GH5610, returns a MI, with the first level being a
1046+
# range index
1047+
keys = list(range(len(values)))
1048+
result = concat(values, axis=self.axis, keys=keys)
9991049
else:
10001050
result = concat(values, axis=self.axis)
10011051

@@ -2187,6 +2237,9 @@ def true_and_notnull(x, *args, **kwargs):
21872237
filtered = self._apply_filter(indices, dropna)
21882238
return filtered
21892239

2240+
def _apply_to_column_groupbys(self, func):
2241+
""" return a pass thru """
2242+
return func(self)
21902243

21912244
class NDFrameGroupBy(GroupBy):
21922245

@@ -2486,6 +2539,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
24862539
elif hasattr(self.grouper, 'groupings'):
24872540
if len(self.grouper.groupings) > 1:
24882541
key_index = MultiIndex.from_tuples(keys, names=key_names)
2542+
24892543
else:
24902544
ping = self.grouper.groupings[0]
24912545
if len(keys) == ping.ngroups:
@@ -2498,8 +2552,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
24982552
# reorder the values
24992553
values = [values[i] for i in indexer]
25002554
else:
2555+
25012556
key_index = Index(keys, name=key_names[0])
25022557

2558+
# don't use the key indexer
2559+
if not self.as_index:
2560+
key_index = None
2561+
25032562
# make Nones an empty object
25042563
if com._count_not_none(*values) != len(values):
25052564
v = next(v for v in values if v is not None)
@@ -2569,7 +2628,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
25692628

25702629
# normally use vstack as its faster than concat
25712630
# and if we have mi-columns
2572-
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
2631+
if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None:
25732632
stacked_values = np.vstack([np.asarray(x) for x in values])
25742633
result = DataFrame(stacked_values,index=key_index,columns=index)
25752634
else:
@@ -2889,16 +2948,6 @@ def _apply_to_column_groupbys(self, func):
28892948
in self._iterate_column_groupbys()),
28902949
keys=self._selected_obj.columns, axis=1)
28912950

2892-
def ohlc(self):
2893-
"""
2894-
Compute sum of values, excluding missing values
2895-
2896-
For multiple groupings, the result index will be a MultiIndex
2897-
"""
2898-
return self._apply_to_column_groupbys(
2899-
lambda x: x._cython_agg_general('ohlc'))
2900-
2901-
29022951
from pandas.tools.plotting import boxplot_frame_groupby
29032952
DataFrameGroupBy.boxplot = boxplot_frame_groupby
29042953

pandas/core/series.py

+2-15
Original file line numberDiff line numberDiff line change
@@ -370,12 +370,12 @@ def __array__(self, result=None):
370370
""" the array interface, return my values """
371371
return self.values
372372

373-
def __array_wrap__(self, result):
373+
def __array_wrap__(self, result, copy=False):
374374
"""
375375
Gets called prior to a ufunc (and after)
376376
"""
377377
return self._constructor(result, index=self.index,
378-
copy=False).__finalize__(self)
378+
copy=copy).__finalize__(self)
379379

380380
def __contains__(self, key):
381381
return key in self.index
@@ -959,19 +959,6 @@ def iteritems(self):
959959
if compat.PY3: # pragma: no cover
960960
items = iteritems
961961

962-
# inversion
963-
def __neg__(self):
964-
values = self.values
965-
if values.dtype == np.bool_:
966-
arr = operator.inv(values)
967-
else:
968-
arr = operator.neg(values)
969-
return self._constructor(arr, self.index).__finalize__(self)
970-
971-
def __invert__(self):
972-
arr = operator.inv(self.values)
973-
return self._constructor(arr, self.index).__finalize__(self)
974-
975962
#----------------------------------------------------------------------
976963
# unbox reductions
977964

0 commit comments

Comments
 (0)