
ENH/BUG: add count to grouper / ensure that grouper keys are not included in the returned #7000

Merged
merged 4 commits, Apr 29, 2014
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -179,6 +179,8 @@ API Changes
validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
- Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
``data`` argument (:issue:`5357`)
- groupby no longer returns the grouped column for non-cython functions (:issue:`5610`),
  as it is already the index
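A minimal sketch of the behaviour described above (hypothetical frame, assuming the 0.14.0 semantics introduced here):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])

    # describe() is a non-cython function dispatched through groupby; the
    # grouped column 'A' now appears only as the result index, not as a column
    df.groupby('A').describe()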

Deprecations
~~~~~~~~~~~~
19 changes: 18 additions & 1 deletion doc/source/v0.14.0.txt
@@ -110,12 +110,29 @@ API changes

.. ipython:: python

DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0) # can also use negative ints

g.nth(0, dropna='any') # similar to old behaviour

groupby no longer returns the grouped column for non-cython functions (:issue:`5610`),
as it is already the index
Contributor: does this add it back in as a col if you do as_index=False?

Contributor (author): yes

Contributor (author): i'll add that as an example

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
g = df.groupby('A')
g.count()
g.describe()

passing ``as_index=False`` will leave the grouped column in-place (this is not a change in 0.14.0)

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
g = df.groupby('A', as_index=False)
g.count()
g.describe()

- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)

10 changes: 6 additions & 4 deletions pandas/core/categorical.py
@@ -226,11 +226,13 @@ def describe(self):
"""
# Hack?
from pandas.core.frame import DataFrame
grouped = DataFrame(self.labels).groupby(0)
counts = grouped.count().values.squeeze()
counts = DataFrame({
'labels' : self.labels,
'values' : self.labels }
).groupby('labels').count().squeeze().values
freqs = counts / float(counts.sum())
return DataFrame.from_dict({
return DataFrame({
'counts': counts,
'freqs': freqs,
'levels': self.levels
}).set_index('levels')
}).set_index('levels')
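For context, a standalone sketch of the computation the rewritten ``describe`` performs, using hypothetical integer labels and levels (not taken from the diff):

    import numpy as np
    from pandas import DataFrame

    labels = np.array([0, 0, 1, 2, 2, 2])   # hypothetical category codes
    levels = np.array(['a', 'b', 'c'])      # hypothetical category levels

    # count occurrences of each label; grouping by 'labels' leaves a single
    # 'values' column, which squeeze() reduces to a Series of counts
    counts = DataFrame({'labels': labels,
                        'values': labels}).groupby('labels').count().squeeze().values
    freqs = counts / float(counts.sum())
    DataFrame({'counts': counts, 'freqs': freqs,
               'levels': levels}).set_index('levels')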
22 changes: 13 additions & 9 deletions pandas/core/generic.py
@@ -611,11 +611,19 @@ def __neg__(self):
arr = operator.inv(values)
else:
arr = operator.neg(values)
return self._wrap_array(arr, self.axes, copy=False)
return self.__array_wrap__(arr)

def __invert__(self):
arr = operator.inv(_values_from_object(self))
return self._wrap_array(arr, self.axes, copy=False)
try:
arr = operator.inv(_values_from_object(self))
return self.__array_wrap__(arr)
except:

# inv fails with 0 len
if not np.prod(self.shape):
return self

raise
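The fallback above is easiest to see on a zero-length object; a rough sketch of the intended behaviour (illustrative only):

    from pandas import DataFrame, Series

    ~Series([True, False])   # normal elementwise inversion
    ~DataFrame()             # operator.inv fails on a 0-len object, so the
                             # empty frame is returned unchanged rather than raising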

def equals(self, other):
"""
@@ -707,15 +715,11 @@ def __abs__(self):
#----------------------------------------------------------------------
# Array Interface

def _wrap_array(self, arr, axes, copy=False):
d = self._construct_axes_dict_from(self, axes, copy=copy)
return self._constructor(arr, **d).__finalize__(self)

def __array__(self, dtype=None):
return _values_from_object(self)

def __array_wrap__(self, result):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
def __array_wrap__(self, result, copy=False):
d = self._construct_axes_dict(self._AXIS_ORDERS, copy=copy)
return self._constructor(result, **d).__finalize__(self)

# ideally we would define this to avoid the getattr checks, but
107 changes: 78 additions & 29 deletions pandas/core/groupby.py
@@ -445,6 +445,23 @@ def _selection_list(self):
return [self._selection]
return self._selection

@cache_readonly
def _selected_obj(self):

if self._selection is None or isinstance(self.obj, Series):
return self.obj
else:
return self.obj[self._selection]

def _set_selection_from_grouper(self):
""" we may need create a selection if we have non-level groupers """
grp = self.grouper
if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None:
ax = self.obj._info_axis
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
if len(groupers):
self._selection = (ax-Index(groupers)).tolist()
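In effect, the helper above lets a dispatched method skip the grouping key; a small sketch with a hypothetical frame (matching the whatsnew example above):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'A': [1, 1, 5], 'B': [np.nan, 4.0, 6.0]})

    # 'A' is a column-name (non-level) grouper, so the implicit selection
    # becomes ['B'] and 'A' is reported only as the result index
    df.groupby('A').describe()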

def _local_dir(self):
return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))

@@ -453,7 +470,6 @@ def __getattr__(self, attr):
return object.__getattribute__(self, attr)
if attr in self.obj:
return self[attr]

if hasattr(self.obj, attr):
return self._make_wrapper(attr)

@@ -472,6 +488,10 @@ def _make_wrapper(self, name):
type(self).__name__))
raise AttributeError(msg)

# need to set up the selection,
# as it is not passed directly but via the grouper
self._set_selection_from_grouper()

f = getattr(self._selected_obj, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))
@@ -503,7 +523,19 @@ def curried(x):
try:
return self.apply(curried_with_axis)
except Exception:
return self.apply(curried)
try:
return self.apply(curried)
except Exception:

# related to: GH3688
# try item-by-item
# this can be called recursively, so we need to raise a ValueError if
# we don't have this method, to indicate to aggregate that
# this column should be marked as an error
try:
return self._aggregate_item_by_item(name, *args, **kwargs)
except (AttributeError):
raise ValueError

return wrapper

@@ -624,6 +656,7 @@ def mean(self):
except GroupByError:
raise
except Exception: # pragma: no cover
self._set_selection_from_grouper()
f = lambda x: x.mean(axis=self.axis)
return self._python_agg_general(f)

@@ -639,6 +672,7 @@ def median(self):
raise
except Exception: # pragma: no cover

self._set_selection_from_grouper()
def f(x):
if isinstance(x, np.ndarray):
x = Series(x)
@@ -655,6 +689,7 @@ def std(self, ddof=1):
if ddof == 1:
return self._cython_agg_general('std')
else:
self._set_selection_from_grouper()
f = lambda x: x.std(ddof=ddof)
return self._python_agg_general(f)

@@ -667,15 +702,26 @@ def var(self, ddof=1):
if ddof == 1:
return self._cython_agg_general('var')
else:
self._set_selection_from_grouper()
f = lambda x: x.var(ddof=ddof)
return self._python_agg_general(f)

def size(self):
"""
Compute group sizes

"""
return self.grouper.size()

def count(self, axis=0):
"""
Number of non-null items in each group.
axis : axis number, default 0
the grouping axis
"""
self._set_selection_from_grouper()
return self._python_agg_general(lambda x: notnull(x).sum(axis=axis)).astype('int64')
Contributor: a much simpler way to solve the upcasting!
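An illustrative contrast between the two methods above (hypothetical frame): ``size`` counts rows per group, while ``count`` counts non-null values per remaining column per group.

    import numpy as np
    from pandas import DataFrame

    df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B'])
    g = df.groupby('A')

    g.size()    # 2 rows in each group
    g.count()   # non-null values in 'B': 1 for A=1 (NaN excluded), 2 for A=5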


sum = _groupby_function('sum', 'add', np.sum)
prod = _groupby_function('prod', 'prod', np.prod)
min = _groupby_function('min', 'min', np.min, numeric_only=False)
@@ -685,14 +731,14 @@ def size(self):
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
_convert=True)


def ohlc(self):
"""
Compute open, high, low and close values within each group, excluding missing values

For multiple groupings, the result index will be a MultiIndex

"""
return self._cython_agg_general('ohlc')
return self._apply_to_column_groupbys(
lambda x: x._cython_agg_general('ohlc'))
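A hedged usage sketch of ``ohlc`` after this change (hypothetical data); for a DataFrameGroupBy the same per-column operation is concatenated by ``_apply_to_column_groupbys``:

    from pandas import DataFrame

    df = DataFrame({'A': [1, 1, 1, 5, 5],
                    'B': [3.0, 1.0, 2.0, 7.0, 5.0]})

    # open, high, low and close of 'B' within each group of 'A'
    df.groupby('A')['B'].ohlc()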

def nth(self, n, dropna=None):
"""
@@ -888,13 +934,6 @@ def _cumcount_array(self, arr=None, **kwargs):
cumcounts[v] = arr[len(v)-1::-1]
return cumcounts

@cache_readonly
def _selected_obj(self):
if self._selection is None or isinstance(self.obj, Series):
return self.obj
else:
return self.obj[self._selection]

def _index_with_as_index(self, b):
"""
Take boolean mask of index to be returned from apply, if as_index=True
@@ -990,12 +1029,23 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
result = result.reindex(ax)
else:
result = result.reindex_axis(ax, axis=self.axis)
elif self.group_keys and self.as_index:
group_keys = keys
group_levels = self.grouper.levels
group_names = self.grouper.names
result = concat(values, axis=self.axis, keys=group_keys,
levels=group_levels, names=group_names)

elif self.group_keys:

if self.as_index:

# possible MI return case
group_keys = keys
group_levels = self.grouper.levels
group_names = self.grouper.names
result = concat(values, axis=self.axis, keys=group_keys,
levels=group_levels, names=group_names)
else:

# GH5610, returns a MI, with the first level being a
# range index
keys = list(range(len(values)))
result = concat(values, axis=self.axis, keys=keys)
else:
result = concat(values, axis=self.axis)

@@ -2187,6 +2237,9 @@ def true_and_notnull(x, *args, **kwargs):
filtered = self._apply_filter(indices, dropna)
return filtered

def _apply_to_column_groupbys(self, func):
""" return a pass thru """
return func(self)

class NDFrameGroupBy(GroupBy):

@@ -2486,6 +2539,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
elif hasattr(self.grouper, 'groupings'):
if len(self.grouper.groupings) > 1:
key_index = MultiIndex.from_tuples(keys, names=key_names)

else:
ping = self.grouper.groupings[0]
if len(keys) == ping.ngroups:
@@ -2498,8 +2552,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
# reorder the values
values = [values[i] for i in indexer]
else:

key_index = Index(keys, name=key_names[0])

# don't use the key indexer
if not self.as_index:
key_index = None

# make Nones an empty object
if com._count_not_none(*values) != len(values):
v = next(v for v in values if v is not None)
@@ -2569,7 +2628,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

# normally use vstack as its faster than concat
# and if we have mi-columns
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None:
stacked_values = np.vstack([np.asarray(x) for x in values])
result = DataFrame(stacked_values,index=key_index,columns=index)
else:
@@ -2889,16 +2948,6 @@ def _apply_to_column_groupbys(self, func):
in self._iterate_column_groupbys()),
keys=self._selected_obj.columns, axis=1)

def ohlc(self):
"""
Compute sum of values, excluding missing values

For multiple groupings, the result index will be a MultiIndex
"""
return self._apply_to_column_groupbys(
lambda x: x._cython_agg_general('ohlc'))


from pandas.tools.plotting import boxplot_frame_groupby
DataFrameGroupBy.boxplot = boxplot_frame_groupby

17 changes: 2 additions & 15 deletions pandas/core/series.py
@@ -370,12 +370,12 @@ def __array__(self, result=None):
""" the array interface, return my values """
return self.values

def __array_wrap__(self, result):
def __array_wrap__(self, result, copy=False):
"""
Gets called prior to a ufunc (and after)
"""
return self._constructor(result, index=self.index,
copy=False).__finalize__(self)
copy=copy).__finalize__(self)
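For context, ``__array_wrap__`` is what lets a NumPy ufunc hand its result back as a Series carrying the original index; a small sketch (illustrative only):

    import numpy as np
    from pandas import Series

    s = Series([1.0, 4.0, 9.0], index=['a', 'b', 'c'])
    np.sqrt(s)   # re-wrapped as a Series with index ['a', 'b', 'c']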

def __contains__(self, key):
return key in self.index
@@ -959,19 +959,6 @@ def iteritems(self):
if compat.PY3: # pragma: no cover
items = iteritems

# inversion
def __neg__(self):
values = self.values
if values.dtype == np.bool_:
arr = operator.inv(values)
else:
arr = operator.neg(values)
return self._constructor(arr, self.index).__finalize__(self)

def __invert__(self):
arr = operator.inv(self.values)
return self._constructor(arr, self.index).__finalize__(self)

#----------------------------------------------------------------------
# unbox reductions
