From 93672e69b0fcb69bc17e7c64c6afd770aaec3308 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 12 Nov 2016 12:01:19 -0500
Subject: [PATCH 1/2] ENH: add Series & DataFrame .agg/.aggregate to provide
 convenient function application that mimics the groupby(..).agg/.aggregate
 interface

.apply is now a synonym for .agg, and will accept dict/list-likes for
aggregations

CLN: rename .name attr -> ._selection_name from SeriesGroupby for compat
(didn't exist on DataFrameGroupBy); resolves conflicts w.r.t. setting .name
on a groupby object

closes #1623
closes #14464 (custom .describe)
closes #14483
closes #15015
closes #7014
---
 doc/source/api.rst                    |   4 +
 doc/source/basics.rst                 | 242 +++++++++++++++++++++++++-
 doc/source/computation.rst            |   4 +-
 doc/source/groupby.rst                |   4 +-
 doc/source/timeseries.rst             |   6 +-
 doc/source/whatsnew/v0.20.0.txt       |  68 +++++++-
 pandas/core/base.py                   | 123 +++++++++++--
 pandas/core/frame.py                  |  70 +++++++-
 pandas/core/generic.py                |  85 ++++++++-
 pandas/core/groupby.py                |  39 +++--
 pandas/core/series.py                 |  55 ++++++
 pandas/tests/frame/test_apply.py      | 169 ++++++++++++++++++
 pandas/tests/groupby/test_groupby.py  |   2 +-
 pandas/tests/series/test_apply.py     | 168 +++++++++++++++++-
 pandas/tseries/tests/test_resample.py |   6 +-
 pandas/types/cast.py                  |  17 ++
 16 files changed, 996 insertions(+), 66 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 272dfe72eafe7..ee05532e0ae3a 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -314,6 +314,8 @@ Function application, GroupBy & Window
    :toctree: generated/

    Series.apply
+   Series.aggregate
+   Series.transform
    Series.map
    Series.groupby
    Series.rolling
@@ -833,6 +835,8 @@ Function application, GroupBy & Window

    DataFrame.apply
    DataFrame.applymap
+   DataFrame.aggregate
+   DataFrame.transform
    DataFrame.groupby
    DataFrame.rolling
    DataFrame.expanding

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 2e8abe0a5c329..7d4a776203a2d 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -702,7 +702,8 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.

 1. `Tablewise Function Application`_: :meth:`~DataFrame.pipe`
 2. `Row or Column-wise Function Application`_: :meth:`~DataFrame.apply`
-3. Elementwise_ function application: :meth:`~DataFrame.applymap`
+3. `Aggregation API`_: :meth:`~DataFrame.agg` and :meth:`~DataFrame.transform`
+4. `Applying Elementwise Functions`_: :meth:`~DataFrame.applymap`

 .. _basics.pipe:

@@ -778,6 +779,13 @@ statistics methods, take an optional ``axis`` argument:

    df.apply(np.cumsum)
    df.apply(np.exp)

+``.apply()`` will also dispatch on a string method name.
+
+.. ipython:: python
+
+   df.apply('mean')
+   df.apply('mean', axis=1)
+
 Depending on the return type of the function passed to :meth:`~DataFrame.apply`,
 the result will either be of lower dimension or the same dimension.

@@ -827,16 +835,234 @@ set to True, the passed function will instead receive an ndarray object, which
 has positive performance implications if you do not need the indexing
 functionality.

-.. seealso::
+.. _basics.aggregate:
+
+Aggregation API
+~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+The aggregation API allows one to express possibly multiple aggregation operations in a single concise way.
+This API is similar across pandas objects: :ref:`groupby aggregates `,
+:ref:`window functions `, and the :ref:`resample API `.
+
+We will use a frame similar to the one from above.
+
+.. ipython:: python
+
+   tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                       index=pd.date_range('1/1/2000', periods=10))
+   tsdf.iloc[3:7] = np.nan
+   tsdf
+
+Using a single function is equivalent to ``.apply``; you can also pass named methods as strings.
+This will return a Series of the output.
+
+.. ipython:: python
+
+   tsdf.agg(np.sum)
+
+   tsdf.agg('sum')
+
+   # these are equivalent to a ``.sum()`` because we are aggregating on a single function
+   tsdf.sum()
+
+On a Series this will result in a scalar value.
+
+.. ipython:: python
+
+   tsdf.A.agg('sum')
+
+
+Aggregating multiple functions at once
+++++++++++++++++++++++++++++++++++++++
+
+You can pass multiple aggregation arguments as a list. The result of each passed function will be a row in the resulting DataFrame.
+These rows are naturally named after the aggregation functions.
+
+.. ipython:: python
+
+   tsdf.agg(['sum'])
+
+Multiple functions yield multiple rows.
+
-   The section on :ref:`GroupBy ` demonstrates related, flexible
-   functionality for grouping by some criterion, applying, and combining the
-   results into a Series, DataFrame, etc.
+.. ipython:: python
+
+   tsdf.agg(['sum', 'mean'])
+
+On a Series, multiple functions return a Series, indexed by the function names.
+
+.. ipython:: python
+
+   tsdf.A.agg(['sum', 'mean'])
+
+
+Aggregating with a dict of functions
+++++++++++++++++++++++++++++++++++++
+
+Passing a dictionary of column names to functions (or to lists of functions) to ``DataFrame.agg``
+allows you to customize which functions are applied to which columns.
+
+.. ipython:: python
+
+   tsdf.agg({'A': 'mean', 'B': 'sum'})
+
+Passing a list-like will generate a DataFrame output. You will get a matrix-like output
+of all of the aggregators; some cells may be missing values.
+
+.. ipython:: python
+
+   tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})

-.. _Elementwise:
+
+For a Series, you can pass a dict. You will get back a MultiIndex Series; the outer level will
+be the keys, the inner the names of the functions.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : ['sum', 'mean']})
+
+Alternatively, using multiple dictionary keys, you can rename elements in the aggregation.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : 'sum', 'bar': 'mean'})
+
+Multiple keys will yield a MultiIndex Series. The outer level will be the keys, the inner
+the names of the functions.
+
+.. ipython:: python
+
+   tsdf.A.agg({'foo' : ['sum', 'mean'], 'bar': ['min', 'max', lambda x: x.sum()+1]})
+
+.. _basics.aggregation.mixed_dtypes:
+
+Mixed Dtypes
+++++++++++++
+
+When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
+aggregations. This is similar to how groupby ``.agg`` works.
+
+.. ipython:: python

-Applying elementwise Python functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   mdf = pd.DataFrame({'A': [1, 2, 3],
+                       'B': [1., 2., 3.],
+                       'C': ['foo', 'bar', 'baz'],
+                       'D': pd.date_range('20130101', periods=3)})
+   mdf.dtypes
+
+.. ipython:: python
+
+   mdf.agg(['min', 'sum'])
+
+.. _basics.aggregation.custom_describe:
+
+Custom describe
++++++++++++++++
+
+With ``.agg()`` it is possible to easily create a custom describe function, similar
+to the built-in :ref:`describe function `.
+
+.. ipython:: python
+
+   from functools import partial
+
+   q_25 = partial(pd.Series.quantile, q=0.25)
+   q_25.__name__ = '25%'
+   q_75 = partial(pd.Series.quantile, q=0.75)
+   q_75.__name__ = '75%'
+
+   tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max'])
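+
+The same custom-describe idea works on a single column via the renaming ``dict``
+form shown earlier; since a plain python ``dict`` does not guarantee row order
+on older pythons, a ``collections.OrderedDict`` can pin it. A sketch (the
+quantile lambdas here are purely illustrative), mirroring the pattern used in
+the tests for this change:
+
+.. ipython:: python
+
+   from collections import OrderedDict
+
+   tsdf.A.agg(OrderedDict([('count', 'count'),
+                           ('mean', 'mean'),
+                           ('25%', lambda x: x.quantile(0.25)),
+                           ('75%', lambda x: x.quantile(0.75))]))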
+
+.. _basics.transform:
+
+Transform API
+~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+The ``transform`` method returns an object that is indexed the same (same size)
+as the original. This API allows you to provide *multiple* operations at the same
+time rather than one-by-one. Its API is quite similar to the ``.agg`` API.
+
+We will use a frame similar to the one in the sections above.
+
+.. ipython:: python
+
+   tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                       index=pd.date_range('1/1/2000', periods=10))
+   tsdf.iloc[3:7] = np.nan
+   tsdf
+
+Transform the entire frame. ``.transform()`` accepts a NumPy function, a string
+function name, or a user-defined function.
+
+.. ipython:: python
+
+   tsdf.transform(np.abs)
+   tsdf.transform('abs')
+   tsdf.transform(lambda x: x.abs())
+
+Since this is a single function, it is equivalent to a ufunc application.
+
+.. ipython:: python
+
+   np.abs(tsdf)
+
+Passing a single function to ``.transform()`` with a Series will yield a single Series in return.
+
+.. ipython:: python
+
+   tsdf.A.transform(np.abs)
+
+
+Transform with multiple functions
++++++++++++++++++++++++++++++++++
+
+Passing multiple functions will yield a column multi-indexed DataFrame.
+The first level will be the original frame column names; the second level
+will be the names of the transforming functions.
+
+.. ipython:: python
+
+   tsdf.transform([np.abs, lambda x: x+1])
+
+Passing multiple functions to a Series will yield a DataFrame. The
+resulting column names will be the transforming functions.
+
+.. ipython:: python
+
+   tsdf.A.transform([np.abs, lambda x: x+1])
+
+
+Transforming with a dict of functions
++++++++++++++++++++++++++++++++++++++
+
+Passing a dict of functions will allow selective transforming per column.
+
+.. ipython:: python
+
+   tsdf.transform({'A': np.abs, 'B': lambda x: x+1})
+
+Passing a dict of lists will generate a multi-indexed DataFrame with these
+selective transforms.
+
+.. ipython:: python
+
+   tsdf.transform({'A': np.abs, 'B': [lambda x: x+1, 'sqrt']})
+
+On a Series, passing a dict allows renaming as in ``.agg()``.
+
+.. ipython:: python
+
+   tsdf.A.transform({'foo': np.abs})
+   tsdf.A.transform({'foo': np.abs, 'bar': [lambda x: x+1, 'sqrt']})
+
+
+.. _basics.elementwise:
+
+Applying Elementwise Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Since not all functions can be vectorized (accept NumPy arrays and return
 another array or value), the methods :meth:`~DataFrame.applymap` on DataFrame

diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index a19a56f6f1905..730c10e3393b1 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -565,7 +565,9 @@ Aggregation
 -----------

 Once the ``Rolling``, ``Expanding`` or ``EWM`` objects have been created, several methods are available to
-perform multiple computations on the data. This is very similar to a ``.groupby(...).agg`` seen :ref:`here `.
+perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `,
+:ref:`groupby aggregates `, and :ref:`resample API `.
+

 .. ipython:: python

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 45af02cb60b25..d72ab7a9b121b 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -439,7 +439,9 @@ Aggregation
 -----------

 Once the GroupBy object has been created, several methods are available to
-perform a computation on the grouped data.
+perform a computation on the grouped data. These operations are similar to the
+:ref:`aggregating API `, :ref:`window functions `,
+and :ref:`resample API `.
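+
+For example, the same ``agg`` spelling now works both on a plain ``DataFrame``
+and on a grouped one. A minimal, illustrative sketch (``gdf`` is a made-up
+frame, not one defined elsewhere in these docs):
+
+.. ipython:: python
+
+   gdf = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1, 2, 3]})
+
+   # aggregate the frame itself
+   gdf.agg('sum')
+
+   # the analogous groupby aggregation
+   gdf.groupby('A').agg('sum')
+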
 An obvious one is aggregation via the ``aggregate`` or equivalently ``agg`` method:

diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index e09d240ed91b7..5543e36a7153e 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -1470,11 +1470,13 @@ We can instead only resample those groups where we have points as follows:

    ts.groupby(partial(round, freq='3T')).sum()

+.. _timeseries.aggregate:
+
 Aggregation
 ~~~~~~~~~~~

-Similar to :ref:`groupby aggregates ` and the :ref:`window functions `, a ``Resampler`` can be selectively
-resampled.
+Similar to the :ref:`aggregating API `, :ref:`groupby aggregates `, and :ref:`window functions `,
+a ``Resampler`` can be selectively resampled.

 Resampling a ``DataFrame``, the default will be to act on all columns with the same function.

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index bbf528a50e1bb..0c8125ca9a802 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -9,6 +9,8 @@ users upgrade to this version.

 Highlights include:

+- new ``.agg()`` API for Series/DataFrame similar to the groupby-rolling-resample APIs, see :ref:`here `.
+- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `.
 - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)

 Check the :ref:`API Changes ` and :ref:`deprecations ` before updating.

@@ -22,9 +24,73 @@ Check the :ref:`API Changes ` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~

-- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `.
+.. _whatsnew_0200.enhancements.agg:
+
+``agg`` API
+^^^^^^^^^^^
+
+Series & DataFrame have been enhanced to support the aggregation API. This is an already familiar API that
+is supported for groupby, window operations, and resampling. This allows one to express possibly multiple
+aggregation operations in a single concise way by using ``.agg()`` and ``.transform()``. The
+full documentation is :ref:`here ` (:issue:`1623`).
+
+Here is a sample:
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
+                     index=pd.date_range('1/1/2000', periods=10))
+   df.iloc[3:7] = np.nan
+   df
+
+One can operate using string function names, callables, lists, or dictionaries of these.
+
+Using a single function is equivalent to ``.apply``.
+
+.. ipython:: python
+
+   df.agg('sum')
+
+Multiple functions can be passed as a list.
+
+.. ipython:: python
+
+   df.agg(['sum', 'min'])
+
+Dictionaries provide the ability to perform selective calculations per column.
+
+.. ipython:: python
+
+   df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+
+When operating on a Series, passing a dictionary allows one to rename multiple
+function aggregates; this will return a MultiIndexed Series. The outer level
+holds the keys, the inner the names of the functions.
+
+.. ipython:: python
+
+   df.A.agg({'foo':['sum', 'min'], 'bar' : ['count','max']})
+
+The API also supports a ``.transform()`` function for broadcasting results.
+
+.. ipython:: python
+
+   df.transform(['abs', lambda x: x-x.min()])
+
+When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
+aggregations. This is similar to how groupby ``.agg`` works. (:issue:`15015`)
+
+.. 
ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': [1., 2., 3.], + 'C': ['foo', 'bar', 'baz'], + 'D': pd.date_range('20130101', periods=3)}) + df.dtypes + +.. ipython:: python + df.agg(['min', 'sum']) .. _whatsnew_0200.enhancements.dataio_dtype: diff --git a/pandas/core/base.py b/pandas/core/base.py index 49e43a60403ca..6ef866598c56c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -289,7 +289,9 @@ class SelectionMixin(object): } @property - def name(self): + def _selection_name(self): + """ return a name for myself; this would ideally be the 'name' property, but + we cannot conflict with the Series.name property which can be set """ if self._selection is None: return None # 'result' else: @@ -404,6 +406,26 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate + def _try_aggregate_string_function(self, arg, *args, **kwargs): + """ + if arg is a string, then try to operate on it: + - try to find a function on ourselves + - try to find a numpy function + - raise + + """ + assert isinstance(arg, compat.string_types) + + f = getattr(self, arg, None) + if f is not None: + return f(*args, **kwargs) + + f = getattr(np, arg, None) + if f is not None: + return f(self, *args, **kwargs) + + raise ValueError("{} is an unknown string function".format(arg)) + def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators @@ -427,14 +449,19 @@ def _aggregate(self, arg, *args, **kwargs): is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False + _axis = kwargs.pop('_axis', None) + if _axis is None: + _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) + if isinstance(arg, compat.string_types): - return getattr(self, arg)(*args, **kwargs), None + return self._try_aggregate_string_function(arg, *args, + **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict - if self.axis != 0: # pragma: no cover + if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj @@ -554,32 +581,74 @@ def _agg(arg, func): result = _agg(arg, _agg_2dim) # combine results + + def is_any_series(): + # return a boolean if we have *any* nested series + return any([isinstance(r, ABCSeries) + for r in compat.itervalues(result)]) + + def is_any_frame(): + # return a boolean if we have *any* nested series + return any([isinstance(r, ABCDataFrame) + for r in compat.itervalues(result)]) + if isinstance(result, list): - result = concat(result, keys=keys, axis=1) - elif isinstance(list(compat.itervalues(result))[0], - ABCDataFrame): - result = concat([result[k] for k in keys], keys=keys, axis=1) - else: - from pandas import DataFrame + return concat(result, keys=keys, axis=1), True + + elif is_any_frame(): + # we have a dict of DataFrames + # return a MI DataFrame + + return concat([result[k] for k in keys], + keys=keys, axis=1), True + + elif isinstance(self, ABCSeries) and is_any_series(): + + # we have a dict of Series + # return a MI Series + try: + result = concat(result) + except TypeError: + # we want to give a nice error here if + # we have non-same sized objects, so + # we don't automatically broadcast + + raise ValueError("cannot perform both aggregation " + "and transformation operations " + "simultaneously") + + return result, True + + # fall thru + from pandas import DataFrame, Series + try: result = DataFrame(result) + except ValueError: + + # we have a dict of scalars + result = Series(result, + name=getattr(self, 'name', None)) return result, 
True - elif hasattr(arg, '__iter__'): - return self._aggregate_multiple_funcs(arg, _level=_level), None + elif is_list_like(arg) and arg not in compat.string_types: + # we require a list, but not an 'str' + return self._aggregate_multiple_funcs(arg, + _level=_level, + _axis=_axis), None else: result = None - cy_func = self._is_cython_func(arg) - if cy_func and not args and not kwargs: - return getattr(self, cy_func)(), None + f = self._is_cython_func(arg) + if f and not args and not kwargs: + return getattr(self, f)(), None # caller can react return result, True - def _aggregate_multiple_funcs(self, arg, _level): + def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.tools.merge import concat - if self.axis != 0: + if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: @@ -614,10 +683,30 @@ def _aggregate_multiple_funcs(self, arg, _level): keys.append(col) except (TypeError, DataError): pass + except ValueError: + # cannot aggregate + continue except SpecificationError: raise - return concat(results, keys=keys, axis=1) + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1) + except TypeError: + + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + + from pandas.types.cast import _is_nested_object + from pandas import Series + result = Series(results, index=keys, name=self.name) + if _is_nested_object(result): + raise ValueError("cannot combine transform and " + "aggregation operations") + return result def _shallow_copy(self, obj=None, obj_type=None, **kwargs): """ return a new object with the replacement attributes """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d96fb094f5d5c..0e61d2f6d5702 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4029,6 +4029,42 @@ def diff(self, periods=1, axis=0): # ---------------------------------------------------------------------- # Function application + def _gotitem(self, key, ndim, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + if subset is None: + subset = self + + # TODO: _shallow_copy(subset)? 
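+        # NOTE: _gotitem is the SelectionMixin hook that the .agg/.transform
+        # machinery calls to slice out the piece a function should act on
+        # (a column for ndim=1, a sub-frame for ndim=2); DataFrame can
+        # simply defer to __getitem__ here.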
+ return self[key] + + @Appender(_shared_docs['aggregate'] % _shared_doc_kwargs) + def aggregate(self, func, axis=0, *args, **kwargs): + axis = self._get_axis_number(axis) + + # TODO: flipped axis + result = None + if axis == 0: + try: + result, how = self._aggregate(func, axis=0, *args, **kwargs) + except TypeError: + pass + if result is None: + return self.apply(func, axis=axis, args=args, **kwargs) + return result + + agg = aggregate + def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds): """ @@ -4084,22 +4120,35 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, See also -------- DataFrame.applymap: For elementwise operations + DataFrame.agg: only perform aggregating type operations + DataFrame.transform: only perform transformating type operations Returns ------- applied : Series or DataFrame """ axis = self._get_axis_number(axis) - if kwds or args and not isinstance(func, np.ufunc): + ignore_failures = kwds.pop('ignore_failures', False) + + # dispatch to agg + if axis == 0 and isinstance(func, (list, dict)): + return self.aggregate(func, axis=axis, *args, **kwds) + + if len(self.columns) == 0 and len(self.index) == 0: + return self._apply_empty_result(func, axis, reduce, *args, **kwds) + # if we are a string, try to dispatch + if isinstance(func, compat.string_types): + if axis: + kwds['axis'] = axis + return getattr(self, func)(*args, **kwds) + + if kwds or args and not isinstance(func, np.ufunc): def f(x): return func(x, *args, **kwds) else: f = func - if len(self.columns) == 0 and len(self.index) == 0: - return self._apply_empty_result(func, axis, reduce, *args, **kwds) - if isinstance(f, np.ufunc): with np.errstate(all='ignore'): results = f(self.values) @@ -4116,7 +4165,10 @@ def f(x): else: if reduce is None: reduce = True - return self._apply_standard(f, axis, reduce=reduce) + return self._apply_standard( + f, axis, + reduce=reduce, + ignore_failures=ignore_failures) else: return self._apply_broadcast(f, axis) @@ -4920,7 +4972,13 @@ def f(x): # this can end up with a non-reduction # but not always. if the types are mixed # with datelike then need to make sure a series - result = self.apply(f, reduce=False) + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + result = self.apply(f, reduce=False, + ignore_failures=True) if result.ndim == self.ndim: result = result.iloc[0] return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8ce4c4b00454b..7c6db54a67bb1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -33,7 +33,7 @@ SettingWithCopyError, SettingWithCopyWarning, AbstractMethodError) -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, _ensure_index, InvalidIndexError) import pandas.core.indexing as indexing @@ -91,7 +91,7 @@ def _single_replace(self, to_replace, method, inplace, limit): return result -class NDFrame(PandasObject): +class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. 
Store multi-dimensional in a size-mutable, labeled data structure @@ -428,6 +428,16 @@ def size(self): """number of elements in the NDFrame""" return np.prod(self.shape) + @property + def _selected_obj(self): + """ internal compat with SelectionMixin """ + return self + + @property + def _obj_with_exclusions(self): + """ internal compat with SelectionMixin """ + return self + def _expand_axes(self, key): new_axes = [] for k, ax in zip(key, self.axes): @@ -2764,6 +2774,66 @@ def pipe(self, func, *args, **kwargs): else: return func(self, *args, **kwargs) + _shared_docs['aggregate'] = (""" + Aggregate using input function or dict of {column -> + function} + + .. versionadded:: 0.20.0 + + Parameters + ---------- + func : callable, string, dictionary, or list of string/callables + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. If + passed a dict, the keys must be DataFrame column names. + + Accepted Combinations are: + - string function name + - function + - list of functions + - dict of column names -> functions (or list of functions) + + Notes + ----- + Numpy functions mean/median/prod/sum/std/var are special cased so the + default behavior is applying the function along axis=0 + (e.g., np.mean(arr_2d, axis=0)) as opposed to + mimicking the default Numpy behavior (e.g., np.mean(arr_2d)). + + Returns + ------- + aggregated : %(klass)s + + See also + -------- + """) + + _shared_docs['transform'] = (""" + Call function producing a like-indexed %(klass)s + and return a %(klass)s with the transformed values` + + .. versionadded:: 0.20.0 + + Parameters + ---------- + func : callable, string, dictionary, or list of string/callables + To apply to column + + Accepted Combinations are: + - string function name + - function + - list of functions + - dict of column names -> functions (or list of functions) + + Examples + -------- + >>> df.transform(lambda x: (x - x.mean()) / x.std()) + + Returns + ------- + transformed : %(klass)s + """) + # ---------------------------------------------------------------------- # Attribute access @@ -5596,6 +5666,17 @@ def ewm(self, com=None, span=None, halflife=None, alpha=None, cls.ewm = ewm + @Appender(_shared_docs['transform'] % _shared_doc_kwargs) + def transform(self, func, *args, **kwargs): + result = self.agg(func, *args, **kwargs) + if is_scalar(result) or len(result) != len(self): + raise ValueError("transforms cannot produce " + "aggregated results") + + return result + + cls.transform = transform + def _doc_parms(cls): """Return a tuple of the doc parms.""" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..158693ffead78 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -703,7 +703,7 @@ def _python_apply_general(self, f): not_indexed_same=mutated or self.mutated) def _iterate_slices(self): - yield self.name, self._selected_obj + yield self._selection_name, self._selected_obj def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) @@ -896,9 +896,9 @@ def reset_identity(values): result = concat(values, axis=self.axis) if (isinstance(result, Series) and - getattr(self, 'name', None) is not None): + getattr(self, '_selection_name', None) is not None): - result.name = self.name + result.name = self._selection_name return result @@ -2597,7 +2597,7 @@ class SeriesGroupBy(GroupBy): exec(_def_str) @property - def name(self): + def _selection_name(self): """ since we are a series, we by definition only have a 
single name, but may be the result of a selection or @@ -2740,12 +2740,12 @@ def _aggregate_multiple_funcs(self, arg, _level): def _wrap_output(self, output, index, names=None): """ common agg/transform wrapping logic """ - output = output[self.name] + output = output[self._selection_name] if names is not None: return DataFrame(output, index=index, columns=names) else: - name = self.name + name = self._selection_name if name is None: name = self._selected_obj.name return Series(output, index=index, name=name) @@ -2763,7 +2763,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self.name, index=keys) + return Series([], name=self._selection_name, index=keys) def _get_index(): if self.grouper.nkeys > 1: @@ -2776,7 +2776,7 @@ def _get_index(): # GH #823 index = _get_index() result = DataFrame(values, index=index).stack() - result.name = self.name + result.name = self._selection_name return result if isinstance(values[0], (Series, dict)): @@ -2788,7 +2788,8 @@ def _get_index(): not_indexed_same=not_indexed_same) else: # GH #6265 - return Series(values, index=_get_index(), name=self.name) + return Series(values, index=_get_index(), + name=self._selection_name) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -2964,7 +2965,7 @@ def nunique(self, dropna=True): return Series(res, index=ri, - name=self.name) + name=self._selection_name) @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @@ -3028,7 +3029,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, # multi-index components labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] - names = self.grouper.names + [self.name] + names = self.grouper.names + [self._selection_name] if dropna: mask = labels[-1] != -1 @@ -3063,7 +3064,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if is_integer_dtype(out): out = _ensure_int64(out) - return Series(out, index=mi, name=self.name) + return Series(out, index=mi, name=self._selection_name) # for compat. 
with algos.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -3094,7 +3095,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if is_integer_dtype(out): out = _ensure_int64(out) - return Series(out, index=mi, name=self.name) + return Series(out, index=mi, name=self._selection_name) def count(self): """ Compute count of group, excluding missing values """ @@ -3107,7 +3108,7 @@ def count(self): return Series(out, index=self.grouper.result_index, - name=self.name, + name=self._selection_name, dtype='int64') def _apply_to_column_groupbys(self, func): @@ -3217,7 +3218,7 @@ def aggregate(self, arg, *args, **kwargs): try: assert not args and not kwargs result = self._aggregate_multiple_funcs( - [arg], _level=_level) + [arg], _level=_level, _axis=self.axis) result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name) @@ -3448,7 +3449,8 @@ def first_non_None_value(values): except (ValueError, AttributeError): # GH1738: values is list of arrays of unequal lengths fall # through to the outer else caluse - return Series(values, index=key_index, name=self.name) + return Series(values, index=key_index, + name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -3471,8 +3473,9 @@ def first_non_None_value(values): # only coerce dates if we find at least 1 datetime coerce = True if any([isinstance(x, Timestamp) for x in values]) else False - # self.name not passed through to Series as the result - # should not take the name of original selection of columns + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns return (Series(values, index=key_index) ._convert(datetime=True, coerce=coerce)) diff --git a/pandas/core/series.py b/pandas/core/series.py index f656d72296e3a..2a15aa86201a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2178,6 +2178,49 @@ def map_f(values, f): return self._constructor(new_values, index=self.index).__finalize__(self) + def _gotitem(self, key, ndim, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + return self + + @Appender(generic._shared_docs['aggregate'] % _shared_doc_kwargs) + def aggregate(self, func, axis=0, *args, **kwargs): + axis = self._get_axis_number(axis) + result, how = self._aggregate(func, *args, **kwargs) + if result is None: + + # we can be called from an inner function which + # passes this meta-data + kwargs.pop('_axis', None) + kwargs.pop('_level', None) + + # try a regular apply, this evaluates lambdas + # row-by-row; however if the lambda is expected a Series + # expression, e.g.: lambda x: x-x.quantile(0.25) + # this will fail, so we can try a vectorized evaluation + + # we cannot FIRST try the vectorized evaluation, becuase + # then .agg and .apply would have different semantics if the + # operation is actually defined on the Series, e.g. str + try: + result = self.apply(func, *args, **kwargs) + except (ValueError, AttributeError, TypeError): + result = func(self, *args, **kwargs) + + return result + + agg = aggregate + def apply(self, func, convert_dtype=True, args=(), **kwds): """ Invoke function on values of Series. 
Can be ufunc (a NumPy function @@ -2201,6 +2244,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): See also -------- Series.map: For element-wise operations + Series.agg: only perform aggregating type operations + Series.transform: only perform transformating type operations Examples -------- @@ -2277,6 +2322,15 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): return self._constructor(dtype=self.dtype, index=self.index).__finalize__(self) + # dispatch to agg + if isinstance(func, (list, dict)): + return self.aggregate(func, *args, **kwds) + + # if we are a string, try to dispatch + if isinstance(func, compat.string_types): + return self._try_aggregate_string_function(func, *args, **kwds) + + # handle ufuncs and lambdas if kwds or args and not isinstance(func, np.ufunc): f = lambda x: func(x, *args, **kwds) else: @@ -2286,6 +2340,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): if isinstance(f, np.ufunc): return f(self) + # row-wise access if is_extension_type(self.dtype): mapped = self._values.map(f) else: diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 9e68b7e76d78f..55b81f986983c 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -108,6 +108,17 @@ def test_apply_standard_nonunique(self): rs = df.T.apply(lambda s: s[0], axis=0) assert_series_equal(rs, xp) + def test_with_string_args(self): + + for arg in ['sum', 'mean', 'min', 'max', 'std']: + result = self.frame.apply(arg) + expected = getattr(self.frame, arg)() + tm.assert_series_equal(result, expected) + + result = self.frame.apply(arg, axis=1) + expected = getattr(self.frame, arg)(axis=1) + tm.assert_series_equal(result, expected) + def test_apply_broadcast(self): broadcasted = self.frame.apply(np.mean, broadcast=True) agged = self.frame.apply(np.mean) @@ -448,3 +459,161 @@ def test_apply_non_numpy_dtype(self): df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category') result = df.apply(lambda x: x) assert_frame_equal(result, df) + + +def zip_frames(*frames): + """ + take a list of frames, zip the columns together for each + assume that these all have the first frame columns + + return a new frame + """ + columns = frames[0].columns + zipped = [f[c] for c in columns for f in frames] + return pd.concat(zipped, axis=1) + + +class TestDataFrameAggregate(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_agg_transform(self): + + with np.errstate(all='ignore'): + + f_sqrt = np.sqrt(self.frame) + f_abs = np.abs(self.frame) + + # ufunc + result = self.frame.transform(np.sqrt) + expected = f_sqrt.copy() + assert_frame_equal(result, expected) + + result = self.frame.apply(np.sqrt) + assert_frame_equal(result, expected) + + result = self.frame.transform(np.sqrt) + assert_frame_equal(result, expected) + + # list-like + result = self.frame.apply([np.sqrt]) + expected = f_sqrt.copy() + expected.columns = pd.MultiIndex.from_product( + [self.frame.columns, ['sqrt']]) + assert_frame_equal(result, expected) + + result = self.frame.transform([np.sqrt]) + assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + expected = zip_frames(f_sqrt, f_abs) + expected.columns = pd.MultiIndex.from_product( + [self.frame.columns, ['sqrt', 'absolute']]) + result = self.frame.apply([np.sqrt, np.abs]) + assert_frame_equal(result, expected) + + result = self.frame.transform(['sqrt', np.abs]) + 
assert_frame_equal(result, expected) + + def test_transform_and_agg_err(self): + # cannot both transform and agg + def f(): + self.frame.transform(['max', 'min']) + self.assertRaises(ValueError, f) + + def f(): + with np.errstate(all='ignore'): + self.frame.agg(['max', 'sqrt']) + self.assertRaises(ValueError, f) + + def f(): + with np.errstate(all='ignore'): + self.frame.transform(['max', 'sqrt']) + self.assertRaises(ValueError, f) + + df = pd.DataFrame({'A': range(5), 'B': 5}) + + def f(): + with np.errstate(all='ignore'): + df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']}) + + def test_demo(self): + # demonstration tests + df = pd.DataFrame({'A': range(5), 'B': 5}) + + result = df.agg(['min', 'max']) + expected = DataFrame({'A': [0, 4], 'B': [5, 5]}, + columns=['A', 'B'], + index=['min', 'max']) + tm.assert_frame_equal(result, expected) + + result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']}) + expected = DataFrame({'A': [4.0, 0.0, np.nan], + 'B': [5.0, np.nan, 25.0]}, + columns=['A', 'B'], + index=['max', 'min', 'sum']) + tm.assert_frame_equal(result.reindex_like(expected), expected) + + def test_agg_reduce(self): + # all reducers + expected = zip_frames(self.frame.mean().to_frame(), + self.frame.max().to_frame(), + self.frame.sum().to_frame()).T + expected.index = ['mean', 'max', 'sum'] + result = self.frame.agg(['mean', 'max', 'sum']) + assert_frame_equal(result, expected) + + # dict input with scalars + result = self.frame.agg({'A': 'mean', 'B': 'sum'}) + expected = Series([self.frame.A.mean(), self.frame.B.sum()], + index=['A', 'B']) + assert_series_equal(result.reindex_like(expected), expected) + + # dict input with lists + result = self.frame.agg({'A': ['mean'], 'B': ['sum']}) + expected = DataFrame({'A': Series([self.frame.A.mean()], + index=['mean']), + 'B': Series([self.frame.B.sum()], + index=['sum'])}) + assert_frame_equal(result.reindex_like(expected), expected) + + # dict input with lists with multiple + result = self.frame.agg({'A': ['mean', 'sum'], + 'B': ['sum', 'max']}) + expected = DataFrame({'A': Series([self.frame.A.mean(), + self.frame.A.sum()], + index=['mean', 'sum']), + 'B': Series([self.frame.B.sum(), + self.frame.B.max()], + index=['sum', 'max'])}) + assert_frame_equal(result.reindex_like(expected), expected) + + def test_nuiscance_columns(self): + + # GH 15015 + df = DataFrame({'A': [1, 2, 3], + 'B': [1., 2., 3.], + 'C': ['foo', 'bar', 'baz'], + 'D': pd.date_range('20130101', periods=3)}) + + result = df.agg('min') + expected = Series([1, 1., 'bar', pd.Timestamp('20130101')], + index=df.columns) + assert_series_equal(result, expected) + + result = df.agg(['min']) + expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]], + index=['min'], columns=df.columns) + assert_frame_equal(result, expected) + + result = df.agg('sum') + expected = Series([6, 6., 'foobarbaz'], + index=['A', 'B', 'C']) + assert_series_equal(result, expected) + + result = df.agg(['sum']) + expected = DataFrame([[6, 6., 'foobarbaz']], + index=['sum'], columns=['A', 'B', 'C']) + assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e87b5d04271e8..3031922b85821 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5148,7 +5148,7 @@ def test_tab_completion(self): expected = set( ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 
'ohlc', 'plot', + 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head', 'irow', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index ec7ffde344d31..d82c2b8ec4b10 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -1,13 +1,14 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +from collections import OrderedDict import numpy as np import pandas as pd from pandas import (Index, Series, DataFrame, isnull) from pandas.compat import lrange from pandas import compat -from pandas.util.testing import assert_series_equal +from pandas.util.testing import assert_series_equal, assert_frame_equal import pandas.util.testing as tm from .common import TestData @@ -25,16 +26,11 @@ def test_apply(self): import math assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) - # how to handle Series result, #2316 - result = self.ts.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) - tm.assert_frame_equal(result, expected) - # empty series s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) rs = s.apply(lambda x: x) tm.assert_series_equal(s, rs) + # check all metadata (GH 9322) self.assertIsNot(s, rs) self.assertIs(s.index, rs.index) @@ -66,6 +62,13 @@ def test_apply_dont_convert_dtype(self): result = s.apply(f, convert_dtype=False) self.assertEqual(result.dtype, object) + def test_with_string_args(self): + + for arg in ['sum', 'mean', 'min', 'max', 'std']: + result = self.ts.apply(arg) + expected = getattr(self.ts, arg)() + self.assertEqual(result, expected) + def test_apply_args(self): s = Series(['foo,bar']) @@ -139,6 +142,157 @@ def f(x): tm.assert_series_equal(result, exp) +class TestSeriesAggregate(TestData, tm.TestCase): + + _multiprocess_can_split_ = True + + def test_transform(self): + # transforming functions + + with np.errstate(all='ignore'): + + f_sqrt = np.sqrt(self.series) + f_abs = np.abs(self.series) + + # ufunc + result = self.series.transform(np.sqrt) + expected = f_sqrt.copy() + assert_series_equal(result, expected) + + result = self.series.apply(np.sqrt) + assert_series_equal(result, expected) + + # list-like + result = self.series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ['sqrt'] + assert_frame_equal(result, expected) + + result = self.series.transform([np.sqrt]) + assert_frame_equal(result, expected) + + result = self.series.transform(['sqrt']) + assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ['sqrt', 'absolute'] + result = self.series.apply([np.sqrt, np.abs]) + assert_frame_equal(result, expected) + + result = self.series.transform(['sqrt', 'abs']) + expected.columns = ['sqrt', 'abs'] + assert_frame_equal(result, expected) + + # dict, provide renaming + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ['foo', 'bar'] + expected = expected.unstack().rename('series') + + result = self.series.apply({'foo': np.sqrt, 'bar': np.abs}) + assert_series_equal(result.reindex_like(expected), expected) + + def test_transform_and_agg_error(self): + # we are trying to transform with an aggregator + def f(): + 
self.series.transform(['min', 'max']) + self.assertRaises(ValueError, f) + + def f(): + with np.errstate(all='ignore'): + self.series.agg(['sqrt', 'max']) + self.assertRaises(ValueError, f) + + def f(): + with np.errstate(all='ignore'): + self.series.transform(['sqrt', 'max']) + self.assertRaises(ValueError, f) + + def f(): + with np.errstate(all='ignore'): + self.series.agg({'foo': np.sqrt, 'bar': 'sum'}) + self.assertRaises(ValueError, f) + + def test_demo(self): + # demonstration tests + s = Series(range(6), dtype='int64', name='series') + + result = s.agg(['min', 'max']) + expected = Series([0, 5], index=['min', 'max'], name='series') + tm.assert_series_equal(result, expected) + + result = s.agg({'foo': 'min'}) + expected = Series([0], index=['foo'], name='series') + tm.assert_series_equal(result, expected) + + result = s.agg({'foo': ['min', 'max']}) + expected = DataFrame( + {'foo': [0, 5]}, + index=['min', 'max']).unstack().rename('series') + tm.assert_series_equal(result, expected) + + def test_multiple_aggregators_with_dict_api(self): + + s = Series(range(6), dtype='int64', name='series') + result = s.agg({'foo': ['min', 'max'], 'bar': ['sum', 'mean']}) + + expected = DataFrame( + {'foo': [5.0, np.nan, 0.0, np.nan], + 'bar': [np.nan, 2.5, np.nan, 15.0]}, + columns=['foo', 'bar'], + index=['max', 'mean', + 'min', 'sum']).unstack().rename('series') + tm.assert_series_equal(result.reindex_like(expected), expected) + + def test_agg_apply_evaluate_lambdas_the_same(self): + # test that we are evaluating row-by-row first + # before vectorized evaluation + result = self.series.apply(lambda x: str(x)) + expected = self.series.agg(lambda x: str(x)) + tm.assert_series_equal(result, expected) + + result = self.series.apply(str) + expected = self.series.agg(str) + tm.assert_series_equal(result, expected) + + def test_with_nested_series(self): + # GH 2316 + # .agg with a reducer and a transform, what to do + result = self.ts.apply(lambda x: Series( + [x, x ** 2], index=['x', 'x^2'])) + expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) + tm.assert_frame_equal(result, expected) + + result = self.ts.agg(lambda x: Series( + [x, x ** 2], index=['x', 'x^2'])) + tm.assert_frame_equal(result, expected) + + def test_replicate_describe(self): + # this also tests a result set that is all scalars + expected = self.series.describe() + result = self.series.apply(OrderedDict( + [('count', 'count'), + ('mean', 'mean'), + ('std', 'std'), + ('min', 'min'), + ('25%', lambda x: x.quantile(0.25)), + ('50%', 'median'), + ('75%', lambda x: x.quantile(0.75)), + ('max', 'max')])) + assert_series_equal(result, expected) + + def test_reduce(self): + # reductions with named functions + result = self.series.agg(['sum', 'mean']) + expected = Series([self.series.sum(), + self.series.mean()], + ['sum', 'mean'], + name=self.series.name) + assert_series_equal(result, expected) + + class TestSeriesMap(TestData, tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 26c311b4a72f8..a9468c7e5aff8 100755 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1814,10 +1814,12 @@ def test_how_lambda_functions(self): tm.assert_series_equal(result['foo'], foo_exp) tm.assert_series_equal(result['bar'], bar_exp) + # this is a MI Series, so comparing the names of the results + # doesn't make sense result = ts.resample('M').aggregate({'foo': lambda x: x.mean(), 'bar': lambda x: x.std(ddof=1)}) - 
tm.assert_series_equal(result['foo'], foo_exp) - tm.assert_series_equal(result['bar'], bar_exp) + tm.assert_series_equal(result['foo'], foo_exp, check_names=False) + tm.assert_series_equal(result['bar'], bar_exp, check_names=False) def test_resample_unequal_times(self): # #1772 diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 6b1c3f9c00351..8d7ba305cef1a 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -44,6 +44,23 @@ def _possibly_convert_platform(values): return values +def _is_nested_object(obj): + """ + return a boolean if we have a nested object, e.g. a Series with 1 or + more Series elements + + This may not be necessarily be performant. + + """ + + if isinstance(obj, ABCSeries) and is_object_dtype(obj): + + if any(isinstance(v, ABCSeries) for v in obj.values): + return True + + return False + + def _possibly_downcast_to_dtype(result, dtype): """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32 From d843a4ed1553afaffa50c3f916fc99e437282188 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 27 Dec 2016 07:48:58 -0500 Subject: [PATCH 2/2] ENH: weightby closes #10030 --- pandas/core/base.py | 5 +- pandas/core/categorical.py | 2 +- pandas/core/frame.py | 7 +- pandas/core/generic.py | 129 ++++++++++-------- pandas/core/nanops.py | 63 ++++++--- pandas/core/panel.py | 5 +- pandas/core/series.py | 10 +- pandas/core/weightby.py | 239 ++++++++++++++++++++++++++++++++++ pandas/sparse/series.py | 2 +- pandas/tests/test_generic.py | 146 +-------------------- pandas/tests/test_weightby.py | 233 +++++++++++++++++++++++++++++++++ 11 files changed, 616 insertions(+), 225 deletions(-) create mode 100644 pandas/core/weightby.py create mode 100644 pandas/tests/test_weightby.py diff --git a/pandas/core/base.py b/pandas/core/base.py index 6ef866598c56c..5ffdb4bc66234 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -999,12 +999,15 @@ def hasnans(self): return isnull(self).any() def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): """ perform the reduction type operation if we can """ func = getattr(self, name, None) if func is None: raise TypeError("{klass} cannot perform the operation {op}".format( klass=self.__class__.__name__, op=name)) + if weights is not None: + kwds['weights'] = weights + return func(**kwds) def value_counts(self, normalize=False, sort=True, ascending=False, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0562736038483..ec3d4510cda5b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1747,7 +1747,7 @@ def _reverse_indexer(self): # reduction ops # def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): """ perform the reduction type operation """ func = getattr(self, name, None) if func is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0e61d2f6d5702..b791798c0b087 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4946,11 +4946,14 @@ def _count_level(self, level, axis=0, numeric_only=False): else: return result - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce(self, op, name, axis=0, skipna=True, weights=None, + numeric_only=None, filter_type=None, **kwds): axis = self._get_axis_number(axis) def f(x): + if weights is not None: + kwds['weights'] = weights + return 
op(x, axis=axis, skipna=skipna, **kwds) labels = self._get_agg_axis(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c6db54a67bb1..1ed8e18ccc740 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2552,7 +2552,7 @@ def tail(self, n=5): return self.iloc[-n:] def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): + random_state=None, axis=None, **kwargs): """ Returns a random sample of items from an axis of object. @@ -2567,7 +2567,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Fraction of axis items to return. Cannot be used with `n`. replace : boolean, optional Sample with or without replacement. Default = False. - weights : str or ndarray-like, optional + weights : str or ndarray-like, optional [DEPRECATED] Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and @@ -2638,59 +2638,22 @@ def sample(self, n=None, frac=None, replace=False, weights=None, axis = self._stat_axis_number axis = self._get_axis_number(axis) - axis_length = self.shape[axis] # Process random_state argument rs = com._random_state(random_state) - # Check weights for compliance if weights is not None: + from warnings import warn + warn("the weights argument to .sample() is deprecated." + "use {typ}.weightby(weights, axis={axis}).sample(...) " + "instead".format(typ=type(self).__name__, axis=axis), + FutureWarning, stacklevel=2) + return self.weightby(weights, axis=axis).sample( + n=n, frac=frac, + replace=replace, + random_state=random_state) - # If a series, align with frame - if isinstance(weights, pd.Series): - weights = weights.reindex(self.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, string_types): - if isinstance(self, pd.DataFrame): - if axis == 0: - try: - weights = self[weights] - except KeyError: - raise KeyError("String passed to weights not a " - "valid column") - else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") - else: - raise ValueError("Strings cannot be passed as weights " - "when sampling from a Series or Panel.") - - weights = pd.Series(weights, dtype='float64') - - if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") - - if (weights == np.inf).any() or (weights == -np.inf).any(): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") - - # If has nan, set to zero. - weights = weights.fillna(0) - - # Renormalize if don't sum to 1 - if weights.sum() != 1: - if weights.sum() != 0: - weights = weights / weights.sum() - else: - raise ValueError("Invalid weights: weights sum to zero") - - weights = weights.values + axis_length = self.shape[axis] # If no frac or n, default to n=1. if n is None and frac is None: @@ -2708,6 +2671,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, raise ValueError("A negative number of rows requested. 
Please " "provide positive value.") + weights = kwargs.pop('_weights', None) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis, is_copy=False) @@ -5475,14 +5439,20 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, np.putmask(rs.values, mask, np.nan) return rs - def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + def _agg_by_level(self, name, axis=0, level=0, skipna=True, + weights=None, **kwargs): grouped = self.groupby(level=level, axis=axis) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) axis = self._get_axis_number(axis) method = getattr(type(self), name) - applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) - return grouped.aggregate(applyf) + + def f(x): + if weights is not None: + kwargs['weights'] = weights + return method(x, axis=axis, skipna=skipna, **kwargs) + + return grouped.aggregate(f) @classmethod def _add_numeric_operations(cls): @@ -5677,6 +5647,50 @@ def transform(self, func, *args, **kwargs): cls.transform = transform + def weightby(self, weights, axis=0): + """ + Provides weighted statistical calculations + + .. versionadded:: 0.20.0 + + Parameters + ---------- + weights : str or ndarray-like + If passed a Series, will align with the target object + on the index. + + Index values in weights that are not found in the target + object will be ignored and index values in the target + object not in the weights will be assigned weights of zero. + + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length + as the axis of the target object. + + If weights do not sum to 1, they will be normalized to + sum to 1. Missing values in the weights column will be + treated as zero. + + inf and -inf values not allowed. 
+ + axis : int or string, default 0 + + Returns + ------- + a Weightby lazy object for the particular operation + + Examples + -------- + """ + from pandas.core import weightby + + axis = self._get_axis_number(axis) + return weightby.weightby(self, weights=weights, + axis=axis) + + cls.weightby = weightby + def _doc_parms(cls): """Return a tuple of the doc parms.""" @@ -5781,6 +5795,7 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + weights = kwargs.pop('_weights', None) nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True @@ -5788,9 +5803,9 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, axis = self._stat_axis_number if level is not None: return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna) + skipna=skipna, weights=weights) return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=numeric_only) + weights=weights, numeric_only=numeric_only) return set_function_name(stat_func, name, cls) @@ -5801,6 +5816,7 @@ def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_ddof_doc) def stat_func(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs): + weights = kwargs.pop('_weights', None) nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True @@ -5808,9 +5824,10 @@ def stat_func(self, axis=None, skipna=None, level=None, ddof=1, axis = self._stat_axis_number if level is not None: return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, ddof=ddof) + skipna=skipna, weights=weights, + ddof=ddof) return self._reduce(f, name, axis=axis, numeric_only=numeric_only, - skipna=skipna, ddof=ddof) + weights=weights, skipna=skipna, ddof=ddof) return set_function_name(stat_func, name, cls) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1f76bc850cee9..9b04a6b6710ac 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -71,11 +71,14 @@ def __call__(self, alt): bn_func = None @functools.wraps(alt) - def f(values, axis=None, skipna=True, **kwds): + def f(values, axis=None, skipna=True, weights=None, **kwds): if len(self.kwargs) > 0: for k, v in compat.iteritems(self.kwargs): if k not in kwds: kwds[k] = v + + if weights is not None: + kwds['weights'] = weights try: if self.zero_value is not None and values.size == 0: if values.ndim == 1: @@ -91,7 +94,7 @@ def f(values, axis=None, skipna=True, **kwds): result.fill(0) return result - if (_USE_BOTTLENECK and skipna and + if (_USE_BOTTLENECK and skipna and weights is None and _bn_ok_dtype(values.dtype, bn_name)): result = bn_func(values, axis=axis, **kwds) @@ -101,7 +104,8 @@ def f(values, axis=None, skipna=True, **kwds): result = alt(values, axis=axis, skipna=skipna, **kwds) else: result = alt(values, axis=axis, skipna=skipna, **kwds) - except Exception: + except Exception as e: + try: result = alt(values, axis=axis, skipna=skipna, **kwds) except ValueError as e: @@ -169,11 +173,29 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): return tslib.iNaT -def _get_values(values, skipna, fill_value=None, fill_value_typ=None, - isfinite=False, copy=True): - """ utility to get the values view, mask, dtype +def _get_values(values, skipna, + fill_value=None, fill_value_typ=None, + isfinite=False, weights=None, axis=None, + copy=True): + """ + utility to get the values view, mask, dtype if necessary copy 
and mask using the specified fill_value - copy = True will force the copy + and adjust for weights + + Parameters + ---------- + values : ndarray + skipna : boolean + fill_value : value, default None + value to fillna + fill_value_typ : value, default None + dtype of the fillvalue + isfinite : boolean, default False + weights : ndarray, optional + normalized ndarray, same length as the axis + axis : axis to broadcast, default None + copy : boolean, default True + True will force the copy """ values = _values_from_object(values) if isfinite: @@ -181,6 +203,10 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, else: mask = isnull(values) + # weights + if weights is not None: + values = values * weights.reshape(values.shape) + dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) @@ -267,13 +293,16 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch(zero_value=0) -def nansum(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = _get_values(values, skipna, 0) +def nansum(values, axis=None, skipna=True, weights=None): + values, mask, dtype, dtype_max = _get_values(values, skipna, + 0, weights=weights, + axis=axis) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask) @@ -282,8 +311,10 @@ def nansum(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nanmean(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = _get_values(values, skipna, 0) +def nanmean(values, axis=None, skipna=True, weights=None): + values, mask, dtype, dtype_max = _get_values(values, skipna, + 0, weights=weights, + axis=axis) dtype_sum = dtype_max dtype_count = np.float64 @@ -368,14 +399,14 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): @disallow('M8') @bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1): +def nanstd(values, axis=None, skipna=True, ddof=1, weights=None): result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) return _wrap_results(result, values.dtype) @disallow('M8') @bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): +def nanvar(values, axis=None, skipna=True, ddof=1, weights=None): dtype = values.dtype mask = isnull(values) @@ -414,7 +445,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1): @disallow('M8', 'm8') -def nansem(values, axis=None, skipna=True, ddof=1): +def nansem(values, axis=None, skipna=True, ddof=1, weights=None): var = nanvar(values, axis, skipna, ddof=ddof) mask = isnull(values) @@ -476,7 +507,7 @@ def nanargmin(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanskew(values, axis=None, skipna=True): +def nanskew(values, axis=None, skipna=True, weights=None): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized @@ -531,7 +562,7 @@ def nanskew(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nankurt(values, axis=None, skipna=True): +def nankurt(values, axis=None, skipna=True, weights=None): """ Compute the sample skewness. 
The statistic computed here is the adjusted Fisher-Pearson standardized
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
index f708774dd84ff..8685b51083859 100644
--- a/pandas/core/panel.py
+++ b/pandas/core/panel.py
@@ -1101,10 +1101,13 @@ def _apply_2d(self, func, axis):
         return self._construct_return_type(dict(results))

     def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
+                weights=None, filter_type=None, **kwds):
         if numeric_only:
             raise NotImplementedError('Panel.{0} does not implement '
                                       'numeric_only.'.format(name))
+        if weights is not None:
+            raise NotImplementedError('Panel.{0} does not implement '
+                                      'weights.'.format(name))

         axis_name = self._get_axis_name(axis)
         axis_number = self._get_axis_number(axis_name)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 2a15aa86201a2..17b7382d6e761 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2354,8 +2354,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         return self._constructor(mapped,
                                  index=self.index).__finalize__(self)

-    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
-                filter_type=None, **kwds):
+    def _reduce(self, op, name, axis=0, skipna=True, weights=None,
+                numeric_only=None, filter_type=None, **kwds):
         """
         perform a reduction operation
@@ -2370,11 +2370,15 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
             if numeric_only:
                 raise NotImplementedError('Series.{0} does not implement '
                                           'numeric_only.'.format(name))
+
+            if weights is not None:
+                kwds['weights'] = weights
+
             with np.errstate(all='ignore'):
                 return op(delegate, skipna=skipna, **kwds)

         return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
-                                numeric_only=numeric_only,
+                                weights=weights, numeric_only=numeric_only,
                                 filter_type=filter_type, **kwds)

     def _reindex_indexer(self, new_index, indexer, copy):
diff --git a/pandas/core/weightby.py b/pandas/core/weightby.py
new file mode 100644
index 0000000000000..92a4e24e09afc
--- /dev/null
+++ b/pandas/core/weightby.py
@@ -0,0 +1,239 @@
+"""
+Provide a lazy structure to support weights for calculations,
+similar to how we have a GroupBy object.
+"""
+
+import numpy as np
+import pandas as pd
+
+from pandas import Series, DataFrame
+from pandas.compat import string_types, set_function_name
+from pandas.types.generic import ABCSeries, ABCDataFrame
+from pandas.types.common import is_scalar, is_list_like
+from pandas.core.base import PandasObject, SelectionMixin
+from pandas.util.decorators import Appender, Substitution
+
+
+class Weightby(PandasObject, SelectionMixin):
+    _attributes = ['weights', 'axis']
+
+    def __init__(self, obj, weights=None, axis=0):
+
+        self.exclusions = set()
+        self._weights = None
+        self.weights = weights
+        self.axis = axis
+        self.obj = obj
+
+    def _gotitem(self, key, ndim, subset=None):
+        """
+        sub-classes to define; return a sliced object
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : 1, 2
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+
+        # create a new object to prevent aliasing
+        if subset is None:
+            subset = self.obj
+
+        newself = self._shallow_copy(subset, obj_type=type(self))
+        newself._reset_cache()
+        if subset.ndim == 2:
+            if is_scalar(key) and key in subset or is_list_like(key):
+                newself._selection = key
+        return newself
+
+    def __getattr__(self, attr):
+        if attr in self._internal_names_set:
+            return object.__getattribute__(self, attr)
+        if attr in self.obj:
+            return self[attr]
+
+        raise
AttributeError("%r object has no attribute %r" % + (type(self).__name__, attr)) + + def _compute_weights(self): + """ + compute our _weights + """ + if self._weights is not None: + return self._weights + + obj = self._selected_obj + + weights = self.weights + axis = self.axis + + # If a series, align with frame + if isinstance(weights, Series): + weights = weights.reindex(obj.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, string_types): + + # we use self.obj as we may have a selection here + if isinstance(self.obj, pd.DataFrame): + if axis == 0: + try: + + # exclude this as an aggregator + self.exclusions.add(weights) + + weights = self.obj[weights] + + except KeyError: + raise KeyError("String passed to weights is not a " + "valid column") + else: + raise ValueError("Strings can only be passed to " + "weights when weighting by the rows on " + "a DataFrame") + else: + raise ValueError("Strings cannot be passed as weights " + "when weighting from a Series or Panel.") + + weights = Series(weights, dtype='float64') + + if len(weights) != len(obj.axes[axis]): + raise ValueError("Weights and axis to be must be of " + "same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative " + "values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + self._weights = weights.values + return self._weights + + def _apply(self, func, *args, **kwargs): + """ + Apply the function with weights + + Parameters + ---------- + func : string/callable to apply + + Returns + ------- + y : type of input + """ + + weights = self._compute_weights() + + # we may need to drop the dim + # before operations + obj = self._obj_with_exclusions + if self._selection is not None: + obj = obj[self._selection] + + f = getattr(obj, func) + + kwargs['axis'] = self.axis + kwargs['_weights'] = weights + + result = f(*args, **kwargs) + result = self._wrap_results(result) + return result + + def _wrap_results(self, result): + return result + + +class SeriesWeightBy(Weightby): + + @property + def _constructor(self): + return Series + + @Substitution(name='weightby') + @Appender(SelectionMixin._see_also_template) + @Appender(SelectionMixin._agg_doc) + def aggregate(self, arg, *args, **kwargs): + return super(SeriesWeightBy, self).aggregate(arg, *args, **kwargs) + + agg = aggregate + + @Appender(Series.sample.__doc__) + def sample(self, n=None, frac=None, replace=False, + random_state=None): + return self._apply('sample', n=n, frac=frac, replace=replace, + random_state=random_state) + + +class DataFrameWeightBy(Weightby): + + @property + def _constructor(self): + return DataFrame + + @Substitution(name='weightby') + @Appender(SelectionMixin._see_also_template) + @Appender(SelectionMixin._agg_doc) + def aggregate(self, arg, *args, **kwargs): + return super(DataFrameWeightBy, self).aggregate(arg, *args, **kwargs) + + agg = aggregate + + @Appender(DataFrame.sample.__doc__) + def sample(self, n=None, frac=None, replace=False, + random_state=None): + return self._apply('sample', n=n, frac=frac, replace=replace, + random_state=random_state) + + +def _add_stat_function(cls, ref_obj, name): + + @Appender(getattr(ref_obj, name).__doc__) + 
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, + **kwargs): + return self._apply(name, axis=axis, skipna=skipna, level=level, + numeric_only=numeric_only, **kwargs) + + setattr(cls, name, set_function_name(stat_func, name, cls)) + + +# add in stat methods +for method in ['sum', 'mean', 'std', 'var', + 'sem', 'kurt', 'skew', 'sem']: + + _add_stat_function(SeriesWeightBy, Series, method) + _add_stat_function(DataFrameWeightBy, DataFrame, method) + + +# Top-level exports +def weightby(obj, *args, **kwds): + if isinstance(obj, ABCSeries): + klass = SeriesWeightBy + elif isinstance(obj, ABCDataFrame): + klass = DataFrameWeightBy + else: + raise TypeError('invalid type: %s' % type(obj)) + + return klass(obj, *args, **kwds) + + +weightby.__doc__ = Weightby.__doc__ diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index d6bc892921c42..d2842ae80088a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -325,7 +325,7 @@ def __array_finalize__(self, obj): self.fill_value = getattr(obj, 'fill_value', None) def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): """ perform a reduction operation """ return op(self.get_values(), skipna=skipna, **kwds) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3500ce913462a..299ce80243d3a 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -449,60 +449,6 @@ def test_sample(self): self.assertTrue(len(o.sample(frac=0.34) == 3)) self.assertTrue(len(o.sample(frac=0.36) == 4)) - ### - # Check weights - ### - - # Weight length must be right - with tm.assertRaises(ValueError): - o.sample(n=3, weights=[0, 1]) - - with tm.assertRaises(ValueError): - bad_weights = [0.5] * 11 - o.sample(n=3, weights=bad_weights) - - with tm.assertRaises(ValueError): - bad_weight_series = Series([0, 0, 0.2]) - o.sample(n=4, weights=bad_weight_series) - - # Check won't accept negative weights - with tm.assertRaises(ValueError): - bad_weights = [-0.1] * 10 - o.sample(n=3, weights=bad_weights) - - # Check inf and -inf throw errors: - with tm.assertRaises(ValueError): - weights_with_inf = [0.1] * 10 - weights_with_inf[0] = np.inf - o.sample(n=3, weights=weights_with_inf) - - with tm.assertRaises(ValueError): - weights_with_ninf = [0.1] * 10 - weights_with_ninf[0] = -np.inf - o.sample(n=3, weights=weights_with_ninf) - - # All zeros raises errors - zero_weights = [0] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=zero_weights) - - # All missing weights - nan_weights = [np.nan] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=nan_weights) - - # Check np.nan are replaced by zeros. - weights_with_nan = [np.nan] * 10 - weights_with_nan[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) - - # Check None are also replaced by zeros. - weights_with_None = [None] * 10 - weights_with_None[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) - def test_size_compat(self): # GH8846 # size property should be defined @@ -1580,66 +1526,13 @@ class TestNDFrame(tm.TestCase): # tests that don't fit elsewhere def test_sample(sel): - # Fixes issue: 2419 - # additional specific object based tests - - # A few dataframe test with degenerate weights. 
- easy_weight_list = [0] * 10 - easy_weight_list[5] = 1 + # all weight testing happens in test_weightby.py df = pd.DataFrame({'col1': range(10, 20), 'col2': range(20, 30), - 'colString': ['a'] * 10, - 'easyweights': easy_weight_list}) - sample1 = df.sample(n=1, weights='easyweights') - assert_frame_equal(sample1, df.iloc[5:6]) - - # Ensure proper error if string given as weight for Series, panel, or - # DataFrame with axis = 1. - s = Series(range(10)) - with tm.assertRaises(ValueError): - s.sample(n=3, weights='weight_column') - - panel = pd.Panel(items=[0, 1, 2], major_axis=[2, 3, 4], - minor_axis=[3, 4, 5]) - with tm.assertRaises(ValueError): - panel.sample(n=1, weights='weight_column') - - with tm.assertRaises(ValueError): - df.sample(n=1, weights='weight_column', axis=1) + 'colString': ['a'] * 10}) # Check weighting key error - with tm.assertRaises(KeyError): - df.sample(n=3, weights='not_a_real_column_name') - - # Check that re-normalizes weights that don't sum to one. - weights_less_than_1 = [0] * 10 - weights_less_than_1[0] = 0.5 - tm.assert_frame_equal( - df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) - - ### - # Test axis argument - ### - - # Test axis argument - df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10}) - second_column_weight = [0, 1] - assert_frame_equal( - df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) - - # Different axis arg types - assert_frame_equal(df.sample(n=1, axis='columns', - weights=second_column_weight), - df[['col2']]) - - weight = [0] * 10 - weight[5] = 0.5 - assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), - df.iloc[5:6]) - assert_frame_equal(df.sample(n=1, axis='index', weights=weight), - df.iloc[5:6]) - # Check out of range axis values with tm.assertRaises(ValueError): df.sample(n=1, axis=2) @@ -1651,20 +1544,6 @@ def test_sample(sel): s = pd.Series(range(10)) s.sample(n=1, axis=1) - # Test weight length compared to correct axis - with tm.assertRaises(ValueError): - df.sample(n=1, axis=1, weights=[0.5] * 10) - - # Check weights with axis = 1 - easy_weight_list = [0] * 3 - easy_weight_list[2] = 1 - - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10}) - sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) - assert_frame_equal(sample1, df[['colString']]) - # Test default axes p = pd.Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6], minor_axis=[1, 3, 5]) @@ -1675,27 +1554,6 @@ def test_sample(sel): df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42)) - # Test that function aligns weights with frame - df = DataFrame( - {'col1': [5, 6, 7], - 'col2': ['a', 'b', 'c'], }, index=[9, 5, 3]) - s = Series([1, 0, 0], index=[3, 5, 9]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) - - # Weights have index values to be dropped because not in - # sampled DataFrame - s2 = Series([0.001, 0, 10000], index=[3, 5, 10]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2)) - - # Weights have empty values to be filed with zeros - s3 = Series([0.01, 0], index=[3, 5]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3)) - - # No overlap in weight and sampled DataFrame indices - s4 = Series([1, 0], index=[1, 2]) - with tm.assertRaises(ValueError): - df.sample(1, weights=s4) - def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), diff --git a/pandas/tests/test_weightby.py b/pandas/tests/test_weightby.py new file mode 100644 index 0000000000000..fedbabfa0a91c --- /dev/null +++ 
b/pandas/tests/test_weightby.py @@ -0,0 +1,233 @@ +import numpy as np +import pandas as pd + +from pandas import DataFrame, Series +from pandas.util import testing as tm +from pandas.core import common as com + + +class TestWeights(tm.TestCase): + + def setUp(self): + self.df = DataFrame({'A': [0.25, 0.25, 0.25, 0.25], + 'B': [1, 2, 3, 4]}) + self.df2 = DataFrame({'A': [1, 2, 3, 4], + 'B': [1, 2, 3, 4]}) + + def test_basic(self): + + for f in ['sum', 'mean']: + weights = self.df[['A']] / self.df.A.sum() + result = getattr(self.df.weightby('A'), f)() + expected = getattr(self.df[['B']] * weights.values, f)() + tm.assert_series_equal(result, expected) + + weights2 = self.df2[['A']] / self.df2.A.sum() + result = getattr(self.df2.weightby('A'), f)() + expected = getattr(self.df2[['B']] * weights2.values, f)() + tm.assert_series_equal(result, expected) + + for f in ['kurt', 'skew', 'sem']: + weights = self.df[['A']] / self.df.A.sum() + result = getattr(self.df.weightby('A'), f)() + expected = getattr(self.df[['B']] * weights.values, f)() + # tm.assert_series_equal(result, expected) + + weights2 = self.df2[['A']] / self.df2.A.sum() + result = getattr(self.df2.weightby('A'), f)() + expected = getattr(self.df2[['B']] * weights2.values, f)() + # tm.assert_series_equal(result, expected) + + for f in ['std', 'var']: + + weights = self.df[['A']] / self.df.A.sum() + result = getattr(self.df.weightby('A'), f)(ddof=2) + expected = getattr(self.df[['B']] * weights.values, f)(ddof=2) + # tm.assert_series_equal(result, expected) + + weights2 = self.df2[['A']] / self.df2.A.sum() + result = getattr(self.df2.weightby('A'), f)(ddof=2) + expected = getattr(self.df2[['B']] * weights2.values, f)(ddof=2) + # tm.assert_series_equal(result, expected) + + def test_gotitem(self): + + result = self.df.weightby('A')['B'].sum() + expected = self.df.weightby('A').sum()['B'] + self.assertEqual(result, expected) + + result = self.df.weightby('A').B.sum() + self.assertEqual(result, expected) + + result = self.df['B'].weightby(self.df['A']).sum() + self.assertEqual(result, expected) + + def test_sample_deprecation(self): + rs = com._random_state(1234) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.df.sample(2, random_state=rs, weights='A') + + expected = self.df.iloc[[0, 2]][['B']] + tm.assert_frame_equal(result, expected) + + def test_unsupported(self): + for f in ['first', 'median', 'min', 'max', 'prod']: + + def func(): + getattr(self.df.weightby('A'), f)() + self.assertRaises(AttributeError, func) + + def test_panel_unsupported(self): + panel = pd.Panel(items=[0, 1, 2], major_axis=[2, 3, 4], + minor_axis=[3, 4, 5]) + with tm.assertRaises(AttributeError): + panel.weightby('weight_column') + + def test_weights_validation(self): + o = DataFrame(np.random.randn(10, 10)) + + # Weight length must be right + with tm.assertRaises(ValueError): + o.weightby([0, 1]).sample(n=3) + + with tm.assertRaises(ValueError): + bad_weights = [0.5] * 11 + o.weightby(bad_weights).sample(n=3) + + with tm.assertRaises(ValueError): + bad_weight_series = Series([0, 0, 0.2]) + o.weightby(bad_weight_series).sample(n=4) + + # Check won't accept negative weights + with tm.assertRaises(ValueError): + bad_weights = [-0.1] * 10 + o.weightby(bad_weights).sample(n=3) + + # Check inf and -inf throw errors: + with tm.assertRaises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + o.weightby(weights_with_inf).sample(n=3) + + with tm.assertRaises(ValueError): + weights_with_ninf = [0.1] * 10 + 
weights_with_ninf[0] = -np.inf
+            o.weightby(weights_with_ninf).sample(n=3)
+
+        # All zeros raises errors
+        zero_weights = [0] * 10
+        with tm.assertRaises(ValueError):
+            o.weightby(zero_weights).sample(n=3)
+
+        # All missing weights
+        nan_weights = [np.nan] * 10
+        with tm.assertRaises(ValueError):
+            o.weightby(nan_weights).sample(n=3)
+
+        # Check that np.nan values are replaced by zeros.
+        weights_with_nan = [np.nan] * 10
+        weights_with_nan[5] = 0.5
+        tm.assert_frame_equal(
+            o.weightby(weights_with_nan, axis=0).sample(n=1), o.iloc[5:6])
+
+        # Check that None values are also replaced by zeros.
+        weights_with_None = [None] * 10
+        weights_with_None[5] = 0.5
+        tm.assert_frame_equal(
+            o.weightby(weights_with_None, axis=0).sample(n=1), o.iloc[5:6])
+
+    def test_weights_strings(self):
+        # Fixes issue: 2419
+        # additional specific object based tests
+
+        # A few DataFrame tests with degenerate weights.
+        easy_weight_list = [0] * 10
+        easy_weight_list[5] = 1
+
+        df = pd.DataFrame({'col1': range(10, 20),
+                           'col2': range(20, 30),
+                           'colString': ['a'] * 10,
+                           'easyweights': easy_weight_list})
+        result = df.weightby('easyweights').sample(n=1)
+        expected = df.iloc[5:6, 0:-1]
+        tm.assert_frame_equal(result, expected)
+
+        # Ensure a proper error if a string is given as the weight for a
+        # Series, Panel, or DataFrame with axis=1.
+        s = Series(range(10))
+        with tm.assertRaises(ValueError):
+            s.weightby('weight_column').sample(n=3)
+
+        with tm.assertRaises(ValueError):
+            df.weightby('weight_column', axis=1).sample(n=1)
+
+        # Check weighting key error
+        with tm.assertRaises(KeyError):
+            df.weightby('not_a_real_column_name').sample(n=3)
+
+        # Check that weights that don't sum to one are re-normalized.
+        weights_less_than_1 = [0] * 10
+        weights_less_than_1[0] = 0.5
+        tm.assert_frame_equal(
+            df.weightby(weights_less_than_1).sample(n=1), df.iloc[:1])
+
+    def test_weights_axis(self):
+
+        # Test axis argument
+        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
+        second_column_weight = [0, 1]
+        result = df.weightby(second_column_weight, axis=1).sample(n=1)
+        tm.assert_frame_equal(result, df[['col2']])
+
+        # Different axis arg types
+        result = df.weightby(second_column_weight, axis='columns').sample(n=1)
+        tm.assert_frame_equal(result, df[['col2']])
+
+        weight = [0] * 10
+        weight[5] = 0.5
+        tm.assert_frame_equal(df.weightby(weight, axis='index').sample(n=1),
+                              df.iloc[5:6])
+
+        # Test weight length compared to the correct axis
+        with tm.assertRaises(ValueError):
+            df.weightby([0.5] * 10, axis=1).sample(n=1)
+
+        # Check weights with axis=1
+        easy_weight_list = [0] * 3
+        easy_weight_list[2] = 1
+
+        df = pd.DataFrame({'col1': range(10, 20),
+                           'col2': range(20, 30),
+                           'colString': ['a'] * 10})
+        result = df.weightby(easy_weight_list, axis=1).sample(n=1)
+        tm.assert_frame_equal(result, df[['colString']])
+
+        # Test that the function aligns weights with the frame
+        df = DataFrame(
+            {'col1': [5, 6, 7],
+             'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
+        s = Series([1, 0, 0], index=[3, 5, 9])
+        result = df.weightby(s).sample(1)
+        tm.assert_frame_equal(result, df.loc[[3]])
+
+        # Weights have index values to be dropped because they are not
+        # in the sampled DataFrame
+        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
+        result = df.weightby(s2).sample(1)
+        tm.assert_frame_equal(result, df.loc[[3]])
+
+        # Weights have empty values to be filled with zeros
+        s3 = Series([0.01, 0], index=[3, 5])
+        result = df.weightby(s3).sample(1)
+        tm.assert_frame_equal(result, df.loc[[3]])
+
+        # No overlap between the weight and sampled DataFrame indices
+        s4 = Series([1, 0], index=[1, 2])
+        with tm.assertRaises(ValueError):
+            df.weightby(s4).sample(1)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
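
For illustration, the semantics of ``weightby`` can be sketched with stock
NumPy/pandas only. ``normalized_weights`` and ``toy_weightby`` below are
hypothetical helpers, not part of this patch; they mirror the validation in
``Weightby._compute_weights`` (no ``inf``, no negatives, NaN filled with zero,
renormalization) and the multiply-then-reduce dispatch in ``Weightby._apply``:

import numpy as np
import pandas as pd


def normalized_weights(weights):
    # mirrors Weightby._compute_weights: reject inf and negative
    # weights, fill NaN with zero, then renormalize to sum to 1
    w = pd.Series(weights, dtype='float64')
    if np.isinf(w).any():
        raise ValueError("weight vector may not include `inf` values")
    if (w < 0).any():
        raise ValueError("weight vector may not include negative values")
    w = w.fillna(0)
    if w.sum() == 0:
        raise ValueError("Invalid weights: weights sum to zero")
    return (w / w.sum()).values


def toy_weightby(df, weights, func):
    # a string names a column on axis=0; that column is excluded from
    # the result, just as Weightby adds it to self.exclusions
    if isinstance(weights, str):
        w = normalized_weights(df[weights])
        obj = df.drop(weights, axis=1)
    else:
        w = normalized_weights(weights)
        obj = df
    # multiply by the normalized weights, then reduce
    return getattr(obj.mul(w, axis=0), func)()


df = pd.DataFrame({'A': [0.25, 0.25, 0.25, 0.25],
                   'B': [1., 2., 3., 4.]})

# equivalent to df.weightby('A').sum() under this patch
print(toy_weightby(df, 'A', 'sum'))  # B    2.5

The design mirrors ``.groupby()``/``.rolling()``: ``.weightby(...)`` returns a
lazy object, and the weights are only computed and applied when a stat method
is invoked on it.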
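
Similarly, the equivalence asserted in ``test_basic`` above can be checked by
hand with plain pandas; no patched API is involved in this snippet:

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3, 4],
                   'B': [1, 2, 3, 4]})

# test_basic asserts that df.weightby('A').mean() reduces the
# remaining columns after scaling by the normalized weights
weights = df[['A']] / df.A.sum()
expected = (df[['B']] * weights.values).mean()
print(expected)  # B    0.75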