diff --git a/README.md b/README.md
index 86cf95508a5d9..78e9b93ae535f 100644
--- a/README.md
+++ b/README.md
[README badge-table markup garbled in extraction: the HTML badge rows for
Latest Release, Package Status, License, and Coverage were restructured,
and the separate Conda, Conda-forge, and PyPI rows were replaced by
consolidated Downloads and Gitter rows. The standalone Gitter badge below
the table was also removed:]
-[](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
## What is it
diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml
index 1337fc54e9aac..f9f9208519d61 100644
--- a/ci/environment-dev.yaml
+++ b/ci/environment-dev.yaml
@@ -11,5 +11,5 @@ dependencies:
- python-dateutil>=2.5.0
- python=3
- pytz
- - setuptools>=3.3
+ - setuptools>=24.2.0
- sphinx
diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
index fcbe0da5de305..3430e778a4573 100644
--- a/ci/requirements_dev.txt
+++ b/ci/requirements_dev.txt
@@ -7,5 +7,5 @@ moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
-setuptools>=3.3
-sphinx
\ No newline at end of file
+setuptools>=24.2.0
+sphinx
diff --git a/doc/source/api.rst b/doc/source/api.rst
index e224e9927f55c..e43632ea46bfb 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -2106,6 +2106,7 @@ Standard moving window functions
Rolling.skew
Rolling.kurt
Rolling.apply
+ Rolling.aggregate
Rolling.quantile
Window.mean
Window.sum
@@ -2133,6 +2134,7 @@ Standard expanding window functions
Expanding.skew
Expanding.kurt
Expanding.apply
+ Expanding.aggregate
Expanding.quantile
Exponentially-weighted moving window functions
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 4285767654e25..ff06c369e1897 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -323,7 +323,7 @@ compute the mean absolute deviation on a rolling basis:
mad = lambda x: np.fabs(x - x.mean()).mean()
@savefig rolling_apply_ex.png
- s.rolling(window=60).apply(mad).plot(style='k')
+ s.rolling(window=60).apply(mad, raw=True).plot(style='k')
.. _stats.rolling_window:
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 4e61228d5c0ad..893642410af02 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -496,7 +496,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
def Red(x):
return functools.reduce(CumRet,x,1.0)
- S.expanding().apply(Red)
+ S.expanding().apply(Red, raw=True)
`Replacing some values with mean of the rest of a group
diff --git a/doc/source/extending.rst b/doc/source/extending.rst
index 25c4ba4a4a2a3..b94a43480ed93 100644
--- a/doc/source/extending.rst
+++ b/doc/source/extending.rst
@@ -57,6 +57,13 @@ If you write a custom accessor, make a pull request adding it to our
Extension Types
---------------
+.. versionadded:: 0.23.0
+
+.. warning::
+
+ The ``ExtensionDtype`` and ``ExtensionArray`` APIs are new and
+ experimental. They may change between versions without warning.
+
Pandas defines an interface for implementing data types and arrays that *extend*
NumPy's type system. Pandas itself uses the extension system for some types
that aren't built into NumPy (categorical, period, interval, datetime with
@@ -106,6 +113,24 @@ by some other storage type, like Python lists.
See the `extension array source`_ for the interface definition. The docstrings
and comments contain guidance for properly implementing the interface.
+We provide a test suite for ensuring that your extension arrays satisfy the expected
+behavior. To use the test suite, you must provide several pytest fixtures and inherit
+from the base test class. The required fixtures are found in
+https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/conftest.py.
+
+To use a test, subclass it:
+
+.. code-block:: python
+
+ from pandas.tests.extension import base
+
+ class TestConstructors(base.BaseConstructorsTests):
+ pass
+
+
+See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
+for a list of all the tests available.
+
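+The fixtures describe your array to the test suite: for example, ``dtype``
+should return an instance of your ``ExtensionDtype`` subclass, and ``data``
+a length-100 array of valid values. A minimal sketch, assuming a
+hypothetical ``MyDtype``/``MyArray`` pair:
+
+.. code-block:: python
+
+    import pytest
+
+    from pandas.tests.extension import base
+    from my_extension import MyArray, MyDtype  # hypothetical module
+
+    @pytest.fixture
+    def dtype():
+        # an instance of the ExtensionDtype under test
+        return MyDtype()
+
+    @pytest.fixture
+    def data():
+        # a length-100 array of valid values for the type
+        return MyArray(list(range(100)))
+
+    class TestConstructors(base.BaseConstructorsTests):
+        pass
+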
.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
.. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
diff --git a/doc/source/install.rst b/doc/source/install.rst
index fdb22a8dc3380..4713bbb78d633 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -15,7 +15,7 @@ Instructions for installing from source,
`PyPI `__, `ActivePython `__, various Linux distributions, or a
`development version `__ are also provided.
-.. _install.dropping_27:
+.. _install.dropping-27:
Plan for dropping Python 2.7
----------------------------
@@ -223,7 +223,7 @@ installed), make sure you have `pytest
Dependencies
------------
-* `setuptools `__: 3.3.0 or higher
+* `setuptools `__: 24.2.0 or higher
* `NumPy `__: 1.9.0 or higher
* `python-dateutil <https://dateutil.readthedocs.io/en/stable/>`__: 2.5.0 or higher
* `pytz `__
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 18c4dca5b69da..c5877f10420dc 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -11,7 +11,7 @@ version.
.. warning::
Starting January 1, 2019, pandas feature releases will support Python 3 only.
- See :ref:`here ` for more.
+ See :ref:`install.dropping-27` for more.
.. _whatsnew_0230.enhancements:
@@ -65,6 +65,35 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp
pd.get_dummies(df, columns=['c'], dtype=bool).dtypes
+.. _whatsnew_0230.enhancements.window_raw:
+
+Rolling/Expanding.apply() accepts a ``raw`` keyword to pass a ``Series`` to the function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `,
+:func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have gained a ``raw=None`` parameter.
+This is similar to :func:`DataFrame.apply`. If ``True``, a ``np.ndarray`` is sent to the applied function; if ``False``, a ``Series`` is passed. The
+default is ``None``, which preserves backward compatibility by behaving like ``True`` and sending a ``np.ndarray``.
+In a future version the default will change to ``False``, sending a ``Series`` (:issue:`5071`, :issue:`20584`).
+
+.. ipython:: python
+
+ s = pd.Series(np.arange(5), np.arange(5) + 1)
+ s
+
+Pass a ``Series``:
+
+.. ipython:: python
+
+ s.rolling(2, min_periods=1).apply(lambda x: x.iloc[-1], raw=False)
+
+Mimic the original behavior of passing a ndarray:
+
+.. ipython:: python
+
+ s.rolling(2, min_periods=1).apply(lambda x: x[-1], raw=True)
+
+
.. _whatsnew_0230.enhancements.merge_on_columns_and_levels:
Merging on a combination of columns and index levels
@@ -192,6 +221,12 @@ Current Behavior:
s.rank(na_option='top')
+These bugs were squashed:
+
+- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`)
+- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`)
+- Bug in :func:`DataFrameGroupBy.rank` where ranks were incorrect when both infinity and ``NaN`` were present (:issue:`20561`)
+
.. _whatsnew_0230.enhancements.round-trippable_json:
JSON read/write round-trippable with ``orient='table'``
@@ -306,8 +341,8 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist
.. _whatsnew_023.enhancements.extension:
-Extending Pandas with Custom Types
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Extending Pandas with Custom Types (Experimental)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy
arrays as columns in a DataFrame or values in a Series. This allows third-party
@@ -380,6 +415,7 @@ Other Enhancements
- :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`)
- :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`)
- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
+- :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. if both are ``CategoricalIndex``) (:issue:`18359`)
- :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`)
- ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`)
- ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories`
@@ -408,6 +444,7 @@ Other Enhancements
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
+- :class:`WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`).
- :class:`DataFrame` and :class:`Series` now support matrix multiplication (```@```) operator (:issue:`10259`) for Python>=3.5
- Updated ``to_gbq`` and ``read_gbq`` signature and documentation to reflect changes from
the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ
@@ -435,6 +472,8 @@ If installed, we now require:
+-----------------+-----------------+----------+---------------+
| beautifulsoup4 | 4.2.1 | | :issue:`20082`|
+-----------------+-----------------+----------+---------------+
+| setuptools | 24.2.0 | | :issue:`20698`|
++-----------------+-----------------+----------+---------------+
.. _whatsnew_0230.api_breaking.dict_insertion_order:
@@ -815,6 +854,7 @@ Other API Changes
- :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`)
- Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
- :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
+- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
.. _whatsnew_0230.deprecations:
@@ -843,6 +883,9 @@ Deprecations
- ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`)
- ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`)
- The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`).
+- :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `,
+ :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
+- ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`)
.. _whatsnew_0230.prior_deprecations:
@@ -1045,14 +1088,12 @@ Offsets
Numeric
^^^^^^^
-- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`)
- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`)
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)
- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`)
- Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`)
- Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`)
-- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`)
- Bug where ``NaN`` was returned instead of 0 by :func:`Series.pct_change` and :func:`DataFrame.pct_change` when ``fill_method`` is not ``None`` (:issue:`19873`)
@@ -1077,6 +1118,8 @@ Indexing
- Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` in presence of entire rows of NaNs in the middle of values (:issue:`20499`).
- Bug in :class:`IntervalIndex` where some indexing operations were not supported for overlapping or non-monotonic ``uint64`` data (:issue:`20636`)
- Bug in ``Series.is_unique`` where extraneous output in stderr is shown if Series contains objects with ``__ne__`` defined (:issue:`20661`)
+- Bug in ``.loc`` assignment with a single-element list-like incorrectly assigns as a list (:issue:`19474`)
+- Bug in partial string indexing on a ``Series/DataFrame`` with a monotonic decreasing ``DatetimeIndex`` (:issue:`19362`)
MultiIndex
^^^^^^^^^^
@@ -1114,6 +1157,7 @@ I/O
- Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
- Bug in ``usecols`` parameter in :func:`read_csv` where error is not raised correctly when passing a string. (:issue:`20529`)
- Bug in :func:`HDFStore.keys` when reading a file with a softlink causes exception (:issue:`20523`)
+- Bug in :func:`HDFStore.select_column` where a key which is not a valid store raised an ``AttributeError`` instead of a ``KeyError`` (:issue:`17912`)
Plotting
^^^^^^^^
@@ -1177,6 +1221,7 @@ Reshaping
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
+- Improved error message when the number of levels in a pivot table or an unstacked dataframe is too large causing int32 overflow (:issue:`20601`)
Other
^^^^^
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index de802f4a72277..6a33e4a09476d 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -417,7 +417,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] labels,
bint is_datetimelike, object ties_method,
bint ascending, bint pct, object na_option):
- """Provides the rank of values within each group
+ """
+ Provides the rank of values within each group.
Parameters
----------
@@ -425,17 +426,24 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
values : array of {{c_type}} values to be ranked
labels : array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
- is_datetimelike : bool
+ is_datetimelike : bool, default False
unused in this method but provided for call compatibility with other
Cython transformations
- ties_method : {'keep', 'top', 'bottom'}
+ ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ ascending : boolean, default True
+ False for ranks by high (1) to low (N)
+ pct : boolean, default False
+ Compute percentage rank of data within each group
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
* keep: leave NA values where they are
* top: smallest rank if ascending
* bottom: smallest rank if descending
- ascending : boolean
- False for ranks by high (1) to low (N)
- pct : boolean
- Compute percentage rank of data within each group
Notes
-----
@@ -508,7 +516,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
# if keep_na, check for missing values and assign back
# to the result where appropriate
- if keep_na and masked_vals[_as[i]] == nan_fill_val:
+
+ if keep_na and mask[_as[i]]:
grp_na_count += 1
out[_as[i], 0] = nan
else:
@@ -548,9 +557,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
# reset the dups and sum_ranks, knowing that a new value is coming
# up. the conditional also needs to handle nan equality and the
# end of iteration
- if (i == N - 1 or (
- (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not
- (mask[_as[i]] and mask[_as[i+1]]))):
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]])):
dups = sum_ranks = 0
val_start = i
grp_vals_seen += 1
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
index aa13f03d8e9e4..e524f823605a4 100644
--- a/pandas/_libs/window.pyx
+++ b/pandas/_libs/window.pyx
@@ -1432,30 +1432,35 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
return output
-def roll_generic(ndarray[float64_t, cast=True] input,
+def roll_generic(object obj,
int64_t win, int64_t minp, object index, object closed,
- int offset, object func,
+ int offset, object func, bint raw,
object args, object kwargs):
cdef:
ndarray[double_t] output, counts, bufarr
+ ndarray[float64_t, cast=True] arr
float64_t *buf
float64_t *oldbuf
int64_t nobs = 0, i, j, s, e, N
bint is_variable
ndarray[int64_t] start, end
- if not input.flags.c_contiguous:
- input = input.copy('C')
-
- n = len(input)
+ n = len(obj)
if n == 0:
- return input
+ return obj
+
+ arr = np.asarray(obj)
+
+ # ndarray input
+ if raw:
+ if not arr.flags.c_contiguous:
+ arr = arr.copy('C')
- counts = roll_sum(np.concatenate([np.isfinite(input).astype(float),
+ counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float),
np.array([0.] * offset)]),
win, minp, index, closed)[offset:]
- start, end, N, win, minp, is_variable = get_window_indexer(input, win,
+ start, end, N, win, minp, is_variable = get_window_indexer(arr, win,
minp, index,
closed,
floor=0)
@@ -1463,8 +1468,8 @@ def roll_generic(ndarray[float64_t, cast=True] input,
output = np.empty(N, dtype=float)
if is_variable:
+ # variable window arr or series
- # variable window
if offset != 0:
raise ValueError("unable to roll_generic with a non-zero offset")
@@ -1473,7 +1478,20 @@ def roll_generic(ndarray[float64_t, cast=True] input,
e = end[i]
if counts[i] >= minp:
- output[i] = func(input[s:e], *args, **kwargs)
+ if raw:
+ output[i] = func(arr[s:e], *args, **kwargs)
+ else:
+ output[i] = func(obj.iloc[s:e], *args, **kwargs)
+ else:
+ output[i] = NaN
+
+ elif not raw:
+ # series
+ for i from 0 <= i < N:
+ if counts[i] >= minp:
+ sl = slice(int_max(i + offset - win + 1, 0),
+ int_min(i + offset + 1, N))
+ output[i] = func(obj.iloc[sl], *args, **kwargs)
else:
output[i] = NaN
@@ -1482,12 +1500,12 @@ def roll_generic(ndarray[float64_t, cast=True] input,
# truncated windows at the beginning, through first full-length window
for i from 0 <= i < (int_min(win, N) - offset):
if counts[i] >= minp:
- output[i] = func(input[0: (i + offset + 1)], *args, **kwargs)
+ output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs)
else:
output[i] = NaN
# remaining full-length windows
- buf = input.data
+ buf = arr.data
bufarr = np.empty(win, dtype=float)
oldbuf = bufarr.data
for i from (win - offset) <= i < (N - offset):
@@ -1502,7 +1520,7 @@ def roll_generic(ndarray[float64_t, cast=True] input,
# truncated windows at the end
for i from int_max(N - offset, 0) <= i < N:
if counts[i] >= minp:
- output[i] = func(input[int_max(i + offset - win + 1, 0): N],
+ output[i] = func(arr[int_max(i + offset - win + 1, 0): N],
*args,
**kwargs)
else:
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c281bd80cb274..97a764fa7dbe8 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1,4 +1,10 @@
-"""An interface for extending pandas with custom arrays."""
+"""An interface for extending pandas with custom arrays.
+
+.. warning::
+
+ This is an experimental API and subject to breaking changes
+ without warning.
+"""
import numpy as np
from pandas.errors import AbstractMethodError
@@ -14,12 +20,15 @@ class ExtensionArray(object):
with a custom type and will not attempt to coerce them to objects. They
may be stored directly inside a :class:`DataFrame` or :class:`Series`.
+ .. versionadded:: 0.23.0
+
Notes
-----
The interface includes the following abstract methods that must be
implemented by subclasses:
* _constructor_from_sequence
+ * _from_factorized
* __getitem__
* __len__
* dtype
@@ -30,11 +39,21 @@ class ExtensionArray(object):
* _concat_same_type
Some additional methods are available to satisfy pandas' internal, private
- block API.
+ block API:
* _can_hold_na
* _formatting_values
+ Some methods require casting the ExtensionArray to an ndarray of Python
+ objects with ``self.astype(object)``, which may be expensive. When
+ performance is a concern, we highly recommend overriding the following
+ methods:
+
+ * fillna
+ * unique
+ * factorize / _values_for_factorize
+ * argsort / _values_for_argsort
+
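+ A minimal sketch of ``_values_for_argsort``, assuming the data are
+ stored in a hypothetical ``self.data`` ndarray attribute::
+
+     def _values_for_argsort(self):
+         # ndarray whose argsort matches the ordering of this array
+         return self.data
+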
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
@@ -50,10 +69,6 @@ class ExtensionArray(object):
by some other storage type, like Python lists. Pandas makes no
assumptions on how the data are stored, just that it can be converted
to a NumPy array.
-
- Extension arrays should be able to be constructed with instances of
- the class, i.e. ``ExtensionArray(extension_array)`` should return
- an instance, not error.
"""
# '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray.
# Don't override this.
@@ -458,11 +473,23 @@ def take(self, indexer, allow_fill=True, fill_value=None):
Fill value to replace -1 values with. If applicable, this should
use the sentinel missing value for this type.
+ Returns
+ -------
+ ExtensionArray
+
+ Raises
+ ------
+ IndexError
+ When the indexer is out of bounds for the array.
+
Notes
-----
This should follow pandas' semantics where -1 indicates missing values.
Positions where indexer is ``-1`` should be filled with the missing
value for this type.
+ This gives rise to the special case of a take on an empty
+ ExtensionArray that does not raise an IndexError straight away
+ when the `indexer` is all ``-1``.
This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
indexer is a sequence of values.
@@ -477,6 +504,12 @@ def take(self, indexer, allow_fill=True, fill_value=None):
def take(self, indexer, allow_fill=True, fill_value=None):
indexer = np.asarray(indexer)
mask = indexer == -1
+
+ # take on empty array not handled as desired by numpy
+ # in case of -1 (all missing take)
+ if not len(self) and mask.all():
+ return type(self)([np.nan] * len(indexer))
+
result = self.data.take(indexer)
result[mask] = np.nan # NA for this type
return type(self)(result)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0e60f16891a52..41d67c15c55b5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -411,7 +411,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as e:
exc = TypeError('DataFrame constructor called with '
- 'incompatible data and dtype: %s' % e)
+ 'incompatible data and dtype: {e}'.format(e=e))
raise_with_traceback(exc)
if arr.ndim == 0 and index is not None and columns is not None:
@@ -520,8 +520,9 @@ def _get_axes(N, K, index=index, columns=columns):
try:
values = values.astype(dtype)
except Exception as orig:
- e = ValueError("failed to cast to '%s' (Exception was: %s)"
- % (dtype, orig))
+ e = ValueError("failed to cast to '{dtype}' (Exception "
+ "was: {orig})".format(dtype=dtype,
+ orig=orig))
raise_with_traceback(e)
index, columns = _get_axes(*values.shape)
@@ -873,8 +874,9 @@ def dot(self, other):
lvals = self.values
rvals = np.asarray(other)
if lvals.shape[1] != rvals.shape[0]:
- raise ValueError('Dot product shape mismatch, %s vs %s' %
- (lvals.shape, rvals.shape))
+ raise ValueError('Dot product shape mismatch, '
+ '{l} vs {r}'.format(l=lvals.shape,
+ r=rvals.shape))
if isinstance(other, DataFrame):
return self._constructor(np.dot(lvals, rvals), index=left.index,
@@ -888,7 +890,7 @@ def dot(self, other):
else:
return Series(result, index=left.index)
else: # pragma: no cover
- raise TypeError('unsupported type: %s' % type(other))
+ raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
def __matmul__(self, other):
""" Matrix multiplication using binary `@` operator in Python>=3.5 """
@@ -1098,7 +1100,7 @@ def to_dict(self, orient='dict', into=dict):
return into_c((t[0], dict(zip(self.columns, t[1:])))
for t in self.itertuples())
else:
- raise ValueError("orient '%s' not understood" % orient)
+ raise ValueError("orient '{o}' not understood".format(o=orient))
def to_gbq(self, destination_table, project_id, chunksize=None,
verbose=None, reauth=False, if_exists='fail', private_key=None,
@@ -2140,7 +2142,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
lines.append(self.index._summary())
if len(self.columns) == 0:
- lines.append('Empty %s' % type(self).__name__)
+ lines.append('Empty {name}'.format(name=type(self).__name__))
fmt.buffer_put_lines(buf, lines)
return
@@ -2166,13 +2168,15 @@ def _verbose_repr():
space = max(len(pprint_thing(k)) for k in self.columns) + 4
counts = None
- tmpl = "%s%s"
+ tmpl = "{count}{dtype}"
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
- raise AssertionError('Columns must equal counts (%d != %d)'
- % (len(cols), len(counts)))
- tmpl = "%s non-null %s"
+ raise AssertionError(
+ 'Columns must equal counts '
+ '({cols:d} != {counts:d})'.format(
+ cols=len(cols), counts=len(counts)))
+ tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
@@ -2183,7 +2187,8 @@ def _verbose_repr():
if show_counts:
count = counts.iloc[i]
- lines.append(_put_str(col, space) + tmpl % (count, dtype))
+ lines.append(_put_str(col, space) + tmpl.format(count=count,
+ dtype=dtype))
def _non_verbose_repr():
lines.append(self.columns._summary(name='Columns'))
@@ -2192,9 +2197,12 @@ def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
if num < 1024.0:
- return "%3.1f%s %s" % (num, size_qualifier, x)
+ return ("{num:3.1f}{size_q}"
+ "{x}".format(num=num, size_q=size_qualifier, x=x))
num /= 1024.0
- return "%3.1f%s %s" % (num, size_qualifier, 'PB')
+ return "{num:3.1f}{size_q} {pb}".format(num=num,
+ size_q=size_qualifier,
+ pb='PB')
if verbose:
_verbose_repr()
@@ -2207,8 +2215,9 @@ def _sizeof_fmt(num, size_qualifier):
_verbose_repr()
counts = self.get_dtype_counts()
- dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
- lines.append('dtypes: %s' % ', '.join(dtypes))
+ dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
+ in sorted(compat.iteritems(counts))]
+ lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
if memory_usage is None:
memory_usage = get_option('display.memory_usage')
@@ -2226,8 +2235,9 @@ def _sizeof_fmt(num, size_qualifier):
self.index._is_memory_usage_qualified()):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
- lines.append("memory usage: %s\n" %
- _sizeof_fmt(mem_usage, size_qualifier))
+ lines.append("memory usage: {mem}\n".format(
+ mem=_sizeof_fmt(mem_usage, size_qualifier)))
+
fmt.buffer_put_lines(buf, lines)
def memory_usage(self, index=True, deep=False):
@@ -3013,8 +3023,8 @@ def select_dtypes(self, include=None, exclude=None):
# can't both include AND exclude!
if not include.isdisjoint(exclude):
- raise ValueError('include and exclude overlap on %s' %
- (include & exclude))
+ raise ValueError('include and exclude overlap on {inc_ex}'.format(
+ inc_ex=(include & exclude)))
# empty include/exclude -> defaults to True
# three cases (we've already raised if both are empty)
@@ -3331,7 +3341,11 @@ def reindexer(value):
value = reindexer(value).T
elif isinstance(value, ExtensionArray):
+ from pandas.core.series import _sanitize_index
+ # Explicitly copy here, instead of in _sanitize_index,
+ # as sanitize_index won't copy an EA, even with copy=True
value = value.copy()
+ value = _sanitize_index(value, self.index, copy=False)
elif isinstance(value, Index) or is_sequence(value):
from pandas.core.series import _sanitize_index
@@ -3865,7 +3879,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if verify_integrity and not index.is_unique:
duplicates = index.get_duplicates()
- raise ValueError('Index has duplicate keys: %s' % duplicates)
+ raise ValueError('Index has duplicate keys: {dup}'.format(
+ dup=duplicates))
for c in to_remove:
del frame[c]
@@ -4237,7 +4252,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
mask = count > 0
else:
if how is not None:
- raise ValueError('invalid how option: %s' % how)
+ raise ValueError('invalid how option: {h}'.format(h=how))
else:
raise TypeError('must specify how or thresh')
@@ -6123,8 +6138,11 @@ def append(self, other, ignore_index=False, verify_integrity=False):
# index name will be reset
index = Index([other.name], name=self.index.name)
- combined_columns = self.columns.tolist() + self.columns.union(
- other.index).difference(self.columns).tolist()
+ idx_diff = other.index.difference(self.columns)
+ try:
+ combined_columns = self.columns.append(idx_diff)
+ except TypeError:
+ combined_columns = self.columns.astype(object).append(idx_diff)
other = other.reindex(combined_columns, copy=False)
other = DataFrame(other.values.reshape((1, len(other))),
index=index,
@@ -6746,8 +6764,8 @@ def _count_level(self, level, axis=0, numeric_only=False):
agg_axis = frame._get_agg_axis(axis)
if not isinstance(count_axis, MultiIndex):
- raise TypeError("Can only count levels on hierarchical %s." %
- self._get_axis_name(axis))
+ raise TypeError("Can only count levels on hierarchical "
+ "{ax}.".format(ax=self._get_axis_name(axis)))
if frame._is_mixed_type:
# Since we have mixed types, calling notna(frame.values) might
@@ -6825,9 +6843,9 @@ def f(x):
elif filter_type == 'bool':
data = self._get_bool_data()
else: # pragma: no cover
- e = NotImplementedError("Handling exception with filter_"
- "type %s not implemented." %
- filter_type)
+ e = NotImplementedError(
+ "Handling exception with filter_type {f} not"
+ "implemented.".format(f=filter_type))
raise_with_traceback(e)
with np.errstate(all='ignore'):
result = f(data.values)
@@ -6839,8 +6857,8 @@ def f(x):
elif filter_type == 'bool':
data = self._get_bool_data()
else: # pragma: no cover
- msg = ("Generating numeric_only data with filter_type %s"
- "not supported." % filter_type)
+ msg = ("Generating numeric_only data with filter_type {f}"
+ "not supported.".format(f=filter_type))
raise NotImplementedError(msg)
values = data.values
labels = data._get_agg_axis(axis)
@@ -7115,7 +7133,8 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
elif axis == 1:
new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
else: # pragma: no cover
- raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis))
+ raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
+ ax=axis))
return self._constructor(new_data)
@@ -7146,7 +7165,8 @@ def to_period(self, freq=None, axis=0, copy=True):
elif axis == 1:
new_data.set_axis(0, self.columns.to_period(freq=freq))
else: # pragma: no cover
- raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis))
+ raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
+ ax=axis))
return self._constructor(new_data)
@@ -7505,8 +7525,9 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
else:
if len(columns) != len(content): # pragma: no cover
# caller's responsibility to check for this...
- raise AssertionError('%d columns passed, passed data had %s '
- 'columns' % (len(columns), len(content)))
+ raise AssertionError('{col:d} columns passed, passed data had '
+ '{con} columns'.format(col=len(columns),
+ con=len(content)))
# provide soft conversion of object dtypes
def convert(arr):
@@ -7581,4 +7602,4 @@ def _from_nested_dict(data):
def _put_str(s, space):
- return ('%s' % s)[:space].ljust(space)
+ return u'{s}'.format(s=s)[:space].ljust(space)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ae9d160db08e9..2b683d3a606b1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4292,6 +4292,8 @@ def pipe(self, func, *args, **kwargs):
Notes
-----
`agg` is an alias for `aggregate`. Use the alias.
+
+ A user-defined function passed to `agg` will receive a Series for evaluation.
""")
_shared_docs['transform'] = ("""
@@ -5287,6 +5289,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
See Also
--------
+ interpolate : Fill NaN values using interpolation.
reindex, asfreq
Returns
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 7c89cab6b1428..8c20d62117e25 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1841,24 +1841,27 @@ def cumcount(self, ascending=True):
@Appender(_doc_template)
def rank(self, method='average', ascending=True, na_option='keep',
pct=False, axis=0):
- """Provides the rank of values within each group
+ """
+ Provides the rank of values within each group.
Parameters
----------
- method : {'average', 'min', 'max', 'first', 'dense'}, efault 'average'
+ method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
* average: average rank of group
* min: lowest rank in group
* max: highest rank in group
* first: ranks assigned in order they appear in the array
* dense: like 'min', but rank always increases by 1 between groups
- method : {'keep', 'top', 'bottom'}, default 'keep'
+ ascending : boolean, default True
+ False for ranks by high (1) to low (N)
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
* keep: leave NA values where they are
* top: smallest rank if ascending
* bottom: smallest rank if descending
- ascending : boolean, default True
- False for ranks by high (1) to low (N)
pct : boolean, default False
Compute percentage rank of data within each group
+ axis : int, default 0
+ The axis of the object over which to compute the rank.
Returns
-----
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 95e1f8438c704..95186b2e79a16 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -342,7 +342,8 @@ def _format_with_header(self, header, **kwargs):
def __contains__(self, key):
try:
res = self.get_loc(key)
- return is_scalar(res) or type(res) == slice or np.any(res)
+ return (is_scalar(res) or isinstance(res, slice) or
+ (is_list_like(res) and len(res)))
except (KeyError, TypeError, ValueError):
return False
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 88ea3511d4ee3..e0e7ba3e8b518 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -302,7 +302,7 @@ def _add_comparison_methods(cls):
_engine_type = libindex.DatetimeEngine
tz = None
- offset = None
+ _freq = None
_comparables = ['name', 'freqstr', 'tz']
_attributes = ['name', 'freq', 'tz']
@@ -415,7 +415,7 @@ def __new__(cls, data=None,
subarr = data.values
if freq is None:
- freq = data.offset
+ freq = data.freq
verify_integrity = False
else:
if data.dtype != _NS_DTYPE:
@@ -467,12 +467,12 @@ def __new__(cls, data=None,
if freq_infer:
inferred = subarr.inferred_freq
if inferred:
- subarr.offset = to_offset(inferred)
+ subarr.freq = to_offset(inferred)
return subarr._deepcopy_if_needed(ref_to_data, copy)
@classmethod
- def _generate(cls, start, end, periods, name, offset,
+ def _generate(cls, start, end, periods, name, freq,
tz=None, normalize=False, ambiguous='raise', closed=None):
if com._count_not_none(start, end, periods) != 2:
raise ValueError('Of the three parameters: start, end, and '
@@ -535,7 +535,7 @@ def _generate(cls, start, end, periods, name, offset,
else:
_normalized = _normalized and end.time() == _midnight
- if hasattr(offset, 'delta') and offset != offsets.Day():
+ if hasattr(freq, 'delta') and freq != offsets.Day():
if inferred_tz is None and tz is not None:
# naive dates
if start is not None and start.tz is None:
@@ -551,11 +551,11 @@ def _generate(cls, start, end, periods, name, offset,
if end.tz is None and start.tz is not None:
end = end.tz_localize(start.tz, ambiguous=False)
- if _use_cached_range(offset, _normalized, start, end):
+ if _use_cached_range(freq, _normalized, start, end):
index = cls._cached_range(start, end, periods=periods,
- offset=offset, name=name)
+ freq=freq, name=name)
else:
- index = _generate_regular_range(start, end, periods, offset)
+ index = _generate_regular_range(start, end, periods, freq)
else:
@@ -574,11 +574,11 @@ def _generate(cls, start, end, periods, name, offset,
if end.tz is None and start.tz is not None:
start = start.replace(tzinfo=None)
- if _use_cached_range(offset, _normalized, start, end):
+ if _use_cached_range(freq, _normalized, start, end):
index = cls._cached_range(start, end, periods=periods,
- offset=offset, name=name)
+ freq=freq, name=name)
else:
- index = _generate_regular_range(start, end, periods, offset)
+ index = _generate_regular_range(start, end, periods, freq)
if tz is not None and getattr(index, 'tz', None) is None:
index = conversion.tz_localize_to_utc(_ensure_int64(index), tz,
@@ -596,12 +596,12 @@ def _generate(cls, start, end, periods, name, offset,
index = index[1:]
if not right_closed and len(index) and index[-1] == end:
index = index[:-1]
- index = cls._simple_new(index, name=name, freq=offset, tz=tz)
+ index = cls._simple_new(index, name=name, freq=freq, tz=tz)
return index
@property
def _box_func(self):
- return lambda x: Timestamp(x, freq=self.offset, tz=self.tz)
+ return lambda x: Timestamp(x, freq=self.freq, tz=self.tz)
def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """
@@ -647,7 +647,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None,
result = object.__new__(cls)
result._data = values
result.name = name
- result.offset = freq
+ result._freq = freq
result._tz = timezones.maybe_get_tz(tz)
result._tz = timezones.tz_standardize(result._tz)
result._reset_identity()
@@ -734,7 +734,7 @@ def _has_same_tz(self, other):
return zzone == vzone
@classmethod
- def _cached_range(cls, start=None, end=None, periods=None, offset=None,
+ def _cached_range(cls, start=None, end=None, periods=None, freq=None,
name=None):
if start is None and end is None:
# I somewhat believe this should never be raised externally
@@ -747,30 +747,30 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
raise TypeError(
'Must either specify period or provide both start and end.')
- if offset is None:
+ if freq is None:
# This can't happen with external-facing code
- raise TypeError('Must provide offset.')
+ raise TypeError('Must provide freq.')
drc = _daterange_cache
- if offset not in _daterange_cache:
- xdr = generate_range(offset=offset, start=_CACHE_START,
+ if freq not in _daterange_cache:
+ xdr = generate_range(offset=freq, start=_CACHE_START,
end=_CACHE_END)
arr = tools.to_datetime(list(xdr), box=False)
cachedRange = DatetimeIndex._simple_new(arr)
- cachedRange.offset = offset
+ cachedRange.freq = freq
cachedRange = cachedRange.tz_localize(None)
cachedRange.name = None
- drc[offset] = cachedRange
+ drc[freq] = cachedRange
else:
- cachedRange = drc[offset]
+ cachedRange = drc[freq]
if start is None:
if not isinstance(end, Timestamp):
raise AssertionError('end must be an instance of Timestamp')
- end = offset.rollback(end)
+ end = freq.rollback(end)
endLoc = cachedRange.get_loc(end) + 1
startLoc = endLoc - periods
@@ -778,23 +778,23 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
if not isinstance(start, Timestamp):
raise AssertionError('start must be an instance of Timestamp')
- start = offset.rollforward(start)
+ start = freq.rollforward(start)
startLoc = cachedRange.get_loc(start)
endLoc = startLoc + periods
else:
- if not offset.onOffset(start):
- start = offset.rollforward(start)
+ if not freq.onOffset(start):
+ start = freq.rollforward(start)
- if not offset.onOffset(end):
- end = offset.rollback(end)
+ if not freq.onOffset(end):
+ end = freq.rollback(end)
startLoc = cachedRange.get_loc(start)
endLoc = cachedRange.get_loc(end) + 1
indexSlice = cachedRange[startLoc:endLoc]
indexSlice.name = name
- indexSlice.offset = offset
+ indexSlice.freq = freq
return indexSlice
@@ -836,7 +836,7 @@ def __setstate__(self, state):
np.ndarray.__setstate__(data, nd_state)
self.name = own_state[0]
- self.offset = own_state[1]
+ self.freq = own_state[1]
self._tz = timezones.tz_standardize(own_state[2])
# provide numpy < 1.7 compat
@@ -1184,7 +1184,7 @@ def union(self, other):
result._tz = timezones.tz_standardize(this.tz)
if (result.freq is None and
(this.freq is not None or other.freq is not None)):
- result.offset = to_offset(result.inferred_freq)
+ result.freq = to_offset(result.inferred_freq)
return result
def to_perioddelta(self, freq):
@@ -1232,7 +1232,7 @@ def union_many(self, others):
this._tz = timezones.tz_standardize(tz)
if this.freq is None:
- this.offset = to_offset(this.inferred_freq)
+ this.freq = to_offset(this.inferred_freq)
return this
def join(self, other, how='left', level=None, return_indexers=False,
@@ -1271,7 +1271,7 @@ def _maybe_utc_convert(self, other):
def _wrap_joined_index(self, joined, other):
name = self.name if self.name == other.name else None
if (isinstance(other, DatetimeIndex) and
- self.offset == other.offset and
+ self.freq == other.freq and
self._can_fast_union(other)):
joined = self._shallow_copy(joined)
joined.name = name
@@ -1284,9 +1284,9 @@ def _can_fast_union(self, other):
if not isinstance(other, DatetimeIndex):
return False
- offset = self.offset
+ freq = self.freq
- if offset is None or offset != other.offset:
+ if freq is None or freq != other.freq:
return False
if not self.is_monotonic or not other.is_monotonic:
@@ -1306,10 +1306,10 @@ def _can_fast_union(self, other):
# Only need to "adjoin", not overlap
try:
- return (right_start == left_end + offset) or right_start in left
+ return (right_start == left_end + freq) or right_start in left
except (ValueError):
- # if we are comparing an offset that does not propagate timezones
+ # if we are comparing a freq that does not propagate timezones
# this will raise
return False
@@ -1329,7 +1329,7 @@ def _fast_union(self, other):
left_start, left_end = left[0], left[-1]
right_end = right[-1]
- if not self.offset._should_cache():
+ if not self.freq._should_cache():
# concatenate dates
if left_end < right_end:
loc = right.searchsorted(left_end, side='right')
@@ -1341,7 +1341,7 @@ def _fast_union(self, other):
else:
return type(self)(start=left_start,
end=max(left_end, right_end),
- freq=left.offset)
+ freq=left.freq)
def __iter__(self):
"""
@@ -1393,18 +1393,18 @@ def intersection(self, other):
result = Index.intersection(self, other)
if isinstance(result, DatetimeIndex):
if result.freq is None:
- result.offset = to_offset(result.inferred_freq)
+ result.freq = to_offset(result.inferred_freq)
return result
- elif (other.offset is None or self.offset is None or
- other.offset != self.offset or
- not other.offset.isAnchored() or
+ elif (other.freq is None or self.freq is None or
+ other.freq != self.freq or
+ not other.freq.isAnchored() or
(not self.is_monotonic or not other.is_monotonic)):
result = Index.intersection(self, other)
result = self._shallow_copy(result._values, name=result.name,
tz=result.tz, freq=None)
if result.freq is None:
- result.offset = to_offset(result.inferred_freq)
+ result.freq = to_offset(result.inferred_freq)
return result
if len(self) == 0:
@@ -1729,12 +1729,28 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
@property
def freq(self):
"""get/set the frequency of the Index"""
- return self.offset
+ return self._freq
@freq.setter
def freq(self, value):
"""get/set the frequency of the Index"""
- self.offset = value
+ self._freq = value
+
+ @property
+ def offset(self):
+ """get/set the frequency of the Index"""
+ msg = ('DatetimeIndex.offset has been deprecated and will be removed '
+ 'in a future version; use DatetimeIndex.freq instead.')
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ return self.freq
+
+ @offset.setter
+ def offset(self, value):
+ """get/set the frequency of the Index"""
+ msg = ('DatetimeIndex.offset has been deprecated and will be removed '
+ 'in a future version; use DatetimeIndex.freq instead.')
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ self.freq = value
year = _field_accessor('year', 'Y', "The year of the datetime")
month = _field_accessor('month', 'M',
@@ -2525,9 +2541,9 @@ def day_name(self, locale=None):
DatetimeIndex._add_datetimelike_methods()
-def _generate_regular_range(start, end, periods, offset):
- if isinstance(offset, Tick):
- stride = offset.nanos
+def _generate_regular_range(start, end, periods, freq):
+ if isinstance(freq, Tick):
+ stride = freq.nanos
if periods is None:
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
@@ -2558,7 +2574,7 @@ def _generate_regular_range(start, end, periods, offset):
end = end.to_pydatetime()
xdr = generate_range(start=start, end=end,
- periods=periods, offset=offset)
+ periods=periods, offset=freq)
dates = list(xdr)
# utc = len(dates) > 0 and dates[0].tzinfo is not None
@@ -2855,9 +2871,9 @@ def _in_range(start, end, rng_start, rng_end):
return start > rng_start and end < rng_end
-def _use_cached_range(offset, _normalized, start, end):
- return (offset._should_cache() and
- not (offset._normalize_cache and not _normalized) and
+def _use_cached_range(freq, _normalized, start, end):
+ return (freq._should_cache() and
+ not (freq._normalize_cache and not _normalized) and
_naive_in_cache_range(start, end))
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 9736bba78ab72..2eb52ecc6bcc7 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -532,7 +532,8 @@ def setter(item, v):
def can_do_equal_len():
""" return True if we have an equal len settable """
- if not len(labels) == 1 or not np.iterable(value):
+ if (not len(labels) == 1 or not np.iterable(value) or
+ is_scalar(plane_indexer[0])):
return False
l = len(value)
@@ -2310,6 +2311,49 @@ def check_bool_indexer(ax, key):
return result
+def check_setitem_lengths(indexer, value, values):
+ """Validate that value and indexer are the same length.
+
+ A special case is allowed when the indexer is a boolean array
+ and the number of true values equals the length of ``value``. In
+ this case, no exception is raised.
+
+ Parameters
+ ----------
+ indexer : sequence
+ The key for the setitem
+ value : array-like
+ The value for the setitem
+ values : array-like
+ The values being set into
+
+ Returns
+ -------
+ None
+
+ Raises
+ ------
+ ValueError
+ When the indexer is an ndarray or list and the lengths don't
+ match.
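+
+ Examples
+ --------
+ A boolean indexer may be longer than ``value`` as long as the number
+ of ``True`` entries matches ``len(value)``; a minimal sketch::
+
+     >>> import numpy as np
+     >>> values = np.arange(4)
+     >>> indexer = np.array([True, False, True, False])
+     >>> check_setitem_lengths(indexer, [10, 20], values)  # no error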
+ """
+ # boolean with truth values == len of the value is ok too
+ if isinstance(indexer, (np.ndarray, list)):
+ if is_list_like(value) and len(indexer) != len(value):
+ if not (isinstance(indexer, np.ndarray) and
+ indexer.dtype == np.bool_ and
+ len(indexer[indexer]) == len(value)):
+ raise ValueError("cannot set using a list-like indexer "
+ "with a different length than the value")
+ # slice
+ elif isinstance(indexer, slice):
+
+ if is_list_like(value) and len(values):
+ if len(value) != length_of_indexer(indexer, values):
+ raise ValueError("cannot set using a slice indexer with a "
+ "different length than the value")
+
+
def convert_missing_indexer(indexer):
""" reverse convert a missing indexer, which is a dict
return the scalar indexer and a boolean indicating if we converted
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index e8fab3748bacf..37d11296400be 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -66,7 +66,7 @@
import pandas.core.algorithms as algos
from pandas.core.index import Index, MultiIndex, _ensure_index
-from pandas.core.indexing import maybe_convert_indices, length_of_indexer
+from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths
from pandas.core.arrays import Categorical
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
@@ -817,11 +817,24 @@ def _replace_single(self, *args, **kwargs):
return self if kwargs['inplace'] else self.copy()
def setitem(self, indexer, value, mgr=None):
- """ set the value inplace; return a new block (of a possibly different
- dtype)
+ """Set the value inplace, returning a a maybe different typed block.
- indexer is a direct slice/positional indexer; value must be a
- compatible shape
+ Parameters
+ ----------
+ indexer : tuple, list-like, array-like, slice
+ The subset of self.values to set
+ value : object
+ The value being set
+ mgr : BlockPlacement, optional
+
+ Returns
+ -------
+ Block
+
+ Notes
+ -----
+ `indexer` is a direct slice/positional indexer. `value` must
+ be a compatible shape.
"""
# coerce None values, if appropriate
if value is None:
@@ -876,22 +889,7 @@ def setitem(self, indexer, value, mgr=None):
values = transf(values)
# length checking
- # boolean with truth values == len of the value is ok too
- if isinstance(indexer, (np.ndarray, list)):
- if is_list_like(value) and len(indexer) != len(value):
- if not (isinstance(indexer, np.ndarray) and
- indexer.dtype == np.bool_ and
- len(indexer[indexer]) == len(value)):
- raise ValueError("cannot set using a list-like indexer "
- "with a different length than the value")
-
- # slice
- elif isinstance(indexer, slice):
-
- if is_list_like(value) and len(values):
- if len(value) != length_of_indexer(indexer, values):
- raise ValueError("cannot set using a slice indexer with a "
- "different length than the value")
+ check_setitem_lengths(indexer, value, values)
def _is_scalar_indexer(indexer):
# return True if we are all scalar indexers
@@ -1900,6 +1898,37 @@ def is_view(self):
"""Extension arrays are never treated as views."""
return False
+ def setitem(self, indexer, value, mgr=None):
+ """Set the value inplace, returning a same-typed block.
+
+ This differs from Block.setitem by not allowing setitem to change
+ the dtype of the Block.
+
+ Parameters
+ ----------
+ indexer : tuple, list-like, array-like, slice
+ The subset of self.values to set
+ value : object
+ The value being set
+ mgr : BlockPlacement, optional
+
+ Returns
+ -------
+ Block
+
+ Notes
+ -----
+ `indexer` is a direct slice/positional indexer. `value` must
+ be a compatible shape.
+ """
+ if isinstance(indexer, tuple):
+ # we are always 1-D
+ indexer = indexer[0]
+
+ check_setitem_lengths(indexer, value, self.values)
+ self.values[indexer] = value
+ return self
+
def get_values(self, dtype=None):
# ExtensionArrays must be iterable, so this works.
values = np.asarray(self.values)
@@ -3519,7 +3548,8 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
# with a .values attribute.
aligned_args = dict((k, kwargs[k])
for k in align_keys
- if hasattr(kwargs[k], 'values'))
+ if hasattr(kwargs[k], 'values') and
+ not isinstance(kwargs[k], ABCExtensionArray))
for b in self.blocks:
if filter is not None:
@@ -5220,7 +5250,7 @@ def _safe_reshape(arr, new_shape):
If possible, reshape `arr` to have shape `new_shape`,
with a couple of exceptions (see gh-13012):
- 1) If `arr` is a Categorical or Index, `arr` will be
+ 1) If `arr` is an ExtensionArray or Index, `arr` will be
returned as is.
2) If `arr` is a Series, the `_values` attribute will
be reshaped and returned.
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 0d0023b9f67d3..f8d283e932f44 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -659,6 +659,7 @@ def fillna(self, method, limit=None):
pad : Forward fill NaN values in the resampled data.
nearest : Fill NaN values in the resampled data
with nearest neighbor starting from center.
+ interpolate : Fill NaN values using interpolation.
pandas.Series.fillna : Fill NaN values in the Series using the
specified method, which can be 'bfill' and 'ffill'.
pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 74a9b59d3194a..6c3401bdc58e3 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -29,6 +29,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
index = _convert_by(index)
columns = _convert_by(columns)
+ num_rows = data.reindex(index, axis='columns').shape[0]
+ num_columns = data.reindex(columns, axis='columns').shape[0]
+ if num_rows * num_columns > (2 ** 31 - 1):
+ raise ValueError('Pivot table is too big, causing int32 overflow')
+
if isinstance(aggfunc, list):
pieces = []
keys = []
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 389f1af48434a..fe9fc9b674f50 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -127,6 +127,11 @@ def __init__(self, values, index, level=-1, value_columns=None,
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]
+ num_rows = np.max([index_level.size for index_level in self.new_index_levels])
+ num_columns = self.removed_level.size
+ if num_rows * num_columns > (2 ** 31 - 1):
+ raise ValueError('Unstacked data frame is too big, causing int32 overflow')
+
self._make_sorted_values_labels()
self._make_selectors()
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 1d6f770d92795..2ed4c99b7a998 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -3130,7 +3130,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
>>> def add_custom_values(x, **kwargs):
... for month in kwargs:
... x+=kwargs[month]
- ... return x
+ ... return x
>>> series.apply(add_custom_values, june=30, july=20, august=25)
London 95
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 5cd4fffb5d7dd..f8b5aa292f309 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -314,7 +314,7 @@ def _center_window(self, result, window):
def aggregate(self, arg, *args, **kwargs):
result, how = self._aggregate(arg, *args, **kwargs)
if result is None:
- return self.apply(arg, args=args, kwargs=kwargs)
+ return self.apply(arg, raw=False, args=args, kwargs=kwargs)
return result
agg = aggregate
@@ -954,23 +954,53 @@ def count(self):
Parameters
----------
func : function
- Must produce a single value from an ndarray input
- \*args and \*\*kwargs are passed to the function""")
+ Must produce a single value from an ndarray input if ``raw=True``
+ or a Series if ``raw=False``
+ raw : bool, default None
+ * ``False`` : passes each row or column as a Series to the
+ function.
+ * ``True`` or ``None`` : the passed function will receive ndarray
+ objects instead.
+ If you are just applying a NumPy reduction function this will
+ achieve much better performance.
+
+        The `raw` parameter shows a FutureWarning if not passed
+        explicitly; in the future `raw` will default to ``False``.
+
+ .. versionadded:: 0.23.0
+
+ \*args and \*\*kwargs are passed to the function""")
+
-    def apply(self, func, args=(), kwargs={}):
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        from pandas import Series
# TODO: _level is unused?
_level = kwargs.pop('_level', None) # noqa
window = self._get_window()
offset = _offset(window, self.center)
index, indexi = self._get_index()
+ # TODO: default is for backward compat
+ # change to False in the future
+ if raw is None:
+ warnings.warn(
+ "Currently, 'apply' passes the values as ndarrays to the "
+ "applied function. In the future, this will change to passing "
+ "it as Series objects. You need to specify 'raw=True' to keep "
+ "the current behaviour, and you can pass 'raw=False' to "
+ "silence this warning", FutureWarning, stacklevel=3)
+ raw = True
+
def f(arg, window, min_periods, closed):
minp = _use_window(min_periods, window)
- return _window.roll_generic(arg, window, minp, indexi, closed,
- offset, func, args, kwargs)
+ if not raw:
+ arg = Series(arg, index=self.obj.index)
+ return _window.roll_generic(
+ arg, window, minp, indexi,
+ closed, offset, func, raw, args, kwargs)
return self._apply(f, func, args=args, kwargs=kwargs,
- center=False)
+ center=False, raw=raw)
def sum(self, *args, **kwargs):
nv.validate_window_func('sum', args, kwargs)
@@ -1498,8 +1528,9 @@ def count(self):
@Substitution(name='rolling')
@Appender(_doc_template)
@Appender(_shared_docs['apply'])
- def apply(self, func, args=(), kwargs={}):
- return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
+ def apply(self, func, raw=None, args=(), kwargs={}):
+ return super(Rolling, self).apply(
+ func, raw=raw, args=args, kwargs=kwargs)
@Substitution(name='rolling')
@Appender(_shared_docs['sum'])
@@ -1756,8 +1787,9 @@ def count(self, **kwargs):
@Substitution(name='expanding')
@Appender(_doc_template)
@Appender(_shared_docs['apply'])
- def apply(self, func, args=(), kwargs={}):
- return super(Expanding, self).apply(func, args=args, kwargs=kwargs)
+ def apply(self, func, raw=None, args=(), kwargs={}):
+ return super(Expanding, self).apply(
+ func, raw=raw, args=args, kwargs=kwargs)
@Substitution(name='expanding')
@Appender(_shared_docs['sum'])
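Taken together, the `window.py` changes deprecate the implicit ndarray behaviour of `.apply` on rolling and expanding windows. A short sketch of the new keyword as described in the docstring above (illustrative data, not from this diff):

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(5, dtype='float64'))

# raw=True: func receives a plain ndarray per window (the fast path).
s.rolling(2).apply(lambda x: x[0], raw=True)

# raw=False: func receives a Series per window, with index labels attached.
s.rolling(2).apply(lambda x: x.iloc[0], raw=False)

# Omitting raw keeps the ndarray behaviour but emits the FutureWarning
# added above.
s.rolling(2).apply(lambda x: x[0])
```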
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index f9a496edb45a3..4004a6ea8f6ff 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -887,7 +887,10 @@ def remove(self, key, where=None, start=None, stop=None):
where = _ensure_term(where, scope_level=1)
try:
s = self.get_storer(key)
- except:
+ except KeyError:
+ # the key is not a valid store, re-raising KeyError
+ raise
+ except Exception:
if where is not None:
raise ValueError(
@@ -899,9 +902,6 @@ def remove(self, key, where=None, start=None, stop=None):
s._f_remove(recursive=True)
return None
- if s is None:
- raise KeyError('No object named %s in the file' % key)
-
# remove the node
if com._all_none(where, start, stop):
s.group._f_remove(recursive=True)
@@ -1094,7 +1094,8 @@ def get_storer(self, key):
""" return the storer object for a key, raise if not in the file """
group = self.get_node(key)
if group is None:
- return None
+ raise KeyError('No object named {} in the file'.format(key))
+
s = self._create_storer(group)
s.infer_axes()
return s
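Net effect of the two `pytables.py` hunks: `get_storer` now raises instead of returning `None`, and `remove` re-raises that `KeyError`, so a missing key fails at the lookup rather than in later `None` handling. A minimal sketch, assuming a throwaway file name (requires PyTables):

```python
import pandas as pd

with pd.HDFStore('example.h5', mode='w') as store:
    store.put('df', pd.DataFrame({'a': [1, 2]}))
    try:
        store.remove('no_such_key')
    except KeyError as err:
        print(err)  # 'No object named no_such_key in the file'
```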
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 590f28b275aec..20cd8b43478d2 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -191,9 +191,9 @@ def testinfer_dtype_from_scalar_errors(self):
(pd.Categorical(list('aabc')), 'category', True),
(pd.Categorical([1, 2, 3]), 'category', True),
(Timestamp('20160101'), np.object_, False),
-        (np.datetime64('2016-01-01'), np.dtype('<M8[ns]'), True),
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
-    def test_empty_sum(self):
-        # https://github.com/pandas-dev/pandas/issues/18678
-        df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
-                                               categories=['a', 'b', 'c']),
-                           'B': [1, 2, 1]})
-        expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
-
-        # 0 by default
-        result = df.groupby("A").B.sum()
-        expected = pd.Series([3, 1, 0], expected_idx, name='B')
-        tm.assert_series_equal(result, expected)
-
-        # min_count=0
-        result = df.groupby("A").B.sum(min_count=0)
-        expected = pd.Series([3, 1, 0], expected_idx, name='B')
-        tm.assert_series_equal(result, expected)
-
-        # min_count=1
-        result = df.groupby("A").B.sum(min_count=1)
-        expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
-        tm.assert_series_equal(result, expected)
-
-        # min_count>1
- result = df.groupby("A").B.sum(min_count=2)
- expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
- tm.assert_series_equal(result, expected)
-
- def test_empty_prod(self):
- # https://github.com/pandas-dev/pandas/issues/18678
- df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
- categories=['a', 'b', 'c']),
- 'B': [1, 2, 1]})
-
- expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
-
- # 1 by default
- result = df.groupby("A").B.prod()
- expected = pd.Series([2, 1, 1], expected_idx, name='B')
- tm.assert_series_equal(result, expected)
-
- # min_count=0
- result = df.groupby("A").B.prod(min_count=0)
- expected = pd.Series([2, 1, 1], expected_idx, name='B')
- tm.assert_series_equal(result, expected)
-
- # min_count=1
- result = df.groupby("A").B.prod(min_count=1)
- expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
- tm.assert_series_equal(result, expected)
+ for col in ['C1', 'C2']:
+ result1 = df.groupby(by=col, as_index=False).mean()
+ result2 = df.groupby(by=col, as_index=True).mean().reset_index()
+ expected = exp_full.reindex(columns=result1.columns)
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+
+ # multiple grouper
+ exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2],
+ 'B': [np.nan, 20.0, np.nan, 25.0, np.nan,
+ np.nan],
+ 'C1': Categorical(list("bacbac"),
+ categories=list("bac"),
+ ordered=False),
+ 'C2': Categorical(list("bacbac"),
+ categories=list("bac"),
+ ordered=True)})
+ for cols in [['A', 'C1'], ['A', 'C2']]:
+ result1 = df.groupby(by=cols, as_index=False).mean()
+ result2 = df.groupby(by=cols, as_index=True).mean().reset_index()
+ expected = exp_full.reindex(columns=result1.columns)
+ tm.assert_frame_equal(result1, expected)
+ tm.assert_frame_equal(result2, expected)
+
+
+def test_groupby_categorical_no_compress():
+ data = Series(np.random.randn(9))
+
+ codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
+ cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
+
+ result = data.groupby(cats).mean()
+ exp = data.groupby(codes).mean()
+
+ exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+ ordered=cats.ordered)
+ assert_series_equal(result, exp)
+
+ codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
+ cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
+
+ result = data.groupby(cats).mean()
+ exp = data.groupby(codes).mean().reindex(cats.categories)
+ exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+ ordered=cats.ordered)
+ assert_series_equal(result, exp)
+
+ cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+ categories=["a", "b", "c", "d"], ordered=True)
+ data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+ result = data.groupby("b").mean()
+ result = result["a"].values
+ exp = np.array([1, 2, 4, np.nan])
+ tm.assert_numpy_array_equal(result, exp)
+
+
+def test_groupby_sort_categorical():
+ # dataframe groupby sort was being ignored # GH 8868
+ df = DataFrame([['(7.5, 10]', 10, 10],
+ ['(7.5, 10]', 8, 20],
+ ['(2.5, 5]', 5, 30],
+ ['(5, 7.5]', 6, 40],
+ ['(2.5, 5]', 4, 50],
+ ['(0, 2.5]', 1, 60],
+ ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
+ df['range'] = Categorical(df['range'], ordered=True)
+ index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+ '(7.5, 10]'], name='range', ordered=True)
+ result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+ columns=['foo', 'bar'], index=index)
+
+ col = 'range'
+ assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+ # when categories is ordered, group is ordered by category's order
+ assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+ df['range'] = Categorical(df['range'], ordered=False)
+ index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+ '(7.5, 10]'], name='range')
+ result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+ columns=['foo', 'bar'], index=index)
+
+ index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+ '(0, 2.5]'],
+ categories=['(7.5, 10]', '(2.5, 5]',
+ '(5, 7.5]', '(0, 2.5]'],
+ name='range')
+ result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ index=index, columns=['foo', 'bar'])
+
+ col = 'range'
+ # this is an unordered categorical, but we allow this ####
+ assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+ assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
+
+def test_groupby_sort_categorical_datetimelike():
+ # GH10505
+
+ # use same data as test_groupby_sort_categorical, which category is
+ # corresponding to datetime.month
+ df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+ datetime(2011, 2, 1), datetime(2011, 5, 1),
+ datetime(2011, 2, 1), datetime(2011, 1, 1),
+ datetime(2011, 5, 1)],
+ 'foo': [10, 8, 5, 6, 4, 1, 7],
+ 'bar': [10, 20, 30, 40, 50, 60, 70]},
+ columns=['dt', 'foo', 'bar'])
+
+ # ordered=True
+ df['dt'] = Categorical(df['dt'], ordered=True)
+ index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 7, 1)]
+ result_sort = DataFrame(
+ [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+ result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+ index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 1, 1)]
+ result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ columns=['foo', 'bar'])
+ result_nosort.index = CategoricalIndex(index, categories=index,
+ name='dt', ordered=True)
+
+ col = 'dt'
+ assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+ # when categories is ordered, group is ordered by category's order
+ assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+ # ordered = False
+ df['dt'] = Categorical(df['dt'], ordered=False)
+ index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 7, 1)]
+ result_sort = DataFrame(
+ [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+ result_sort.index = CategoricalIndex(index, name='dt')
+
+ index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+ datetime(2011, 5, 1), datetime(2011, 1, 1)]
+ result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+ columns=['foo', 'bar'])
+ result_nosort.index = CategoricalIndex(index, categories=index,
+ name='dt')
+
+ col = 'dt'
+ assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+ assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
+
+def test_groupby_categorical_two_columns():
+
+ # https://github.com/pandas-dev/pandas/issues/8138
+ d = {'cat':
+ pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
+ ordered=True),
+ 'ints': [1, 1, 2, 2],
+ 'val': [10, 20, 30, 40]}
+ test = pd.DataFrame(d)
+
+ # Grouping on a single column
+ groups_single_key = test.groupby("cat")
+ res = groups_single_key.agg('mean')
+
+ exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat",
+ ordered=True)
+ exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
+ index=exp_index)
+ tm.assert_frame_equal(res, exp)
+
+ # Grouping on two columns
+ groups_double_key = test.groupby(["cat", "ints"])
+ res = groups_double_key.agg('mean')
+ exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
+ "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"],
+ ordered=True),
+ "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"
+ ])
+ tm.assert_frame_equal(res, exp)
+
+ # GH 10132
+ for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
+ c, i = key
+ result = groups_double_key.get_group(key)
+ expected = test[(test.cat == c) & (test.ints == i)]
+ assert_frame_equal(result, expected)
+
+ d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
+ test = pd.DataFrame(d)
+ values = pd.cut(test['C1'], [1, 2, 3, 6])
+ values.name = "cat"
+ groups_double_key = test.groupby([values, 'C2'])
+
+ res = groups_double_key.agg('mean')
+ nan = np.nan
+ idx = MultiIndex.from_product(
+ [Categorical([Interval(1, 2), Interval(2, 3),
+ Interval(3, 6)], ordered=True),
+ [1, 2, 3, 4]],
+ names=["cat", "C2"])
+ exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
+ nan, nan, nan, nan, 4, 5],
+ "C3": [nan, nan, nan, nan, 10, 100,
+ nan, nan, nan, nan, 200, 34]}, index=idx)
+ tm.assert_frame_equal(res, exp)
+
+
+def test_empty_sum():
+ # https://github.com/pandas-dev/pandas/issues/18678
+ df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+ categories=['a', 'b', 'c']),
+ 'B': [1, 2, 1]})
+ expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
+ # 0 by default
+ result = df.groupby("A").B.sum()
+ expected = pd.Series([3, 1, 0], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = df.groupby("A").B.sum(min_count=0)
+ expected = pd.Series([3, 1, 0], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = df.groupby("A").B.sum(min_count=1)
+ expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count>1
+ result = df.groupby("A").B.sum(min_count=2)
+ expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+
+def test_empty_prod():
+ # https://github.com/pandas-dev/pandas/issues/18678
+ df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+ categories=['a', 'b', 'c']),
+ 'B': [1, 2, 1]})
+
+ expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
+ # 1 by default
+ result = df.groupby("A").B.prod()
+ expected = pd.Series([2, 1, 1], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=0
+ result = df.groupby("A").B.prod(min_count=0)
+ expected = pd.Series([2, 1, 1], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
+
+ # min_count=1
+ result = df.groupby("A").B.prod(min_count=1)
+ expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
index cac6b46af8f87..873d9f6076b69 100644
--- a/pandas/tests/groupby/test_filters.py
+++ b/pandas/tests/groupby/test_filters.py
@@ -1,622 +1,576 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
-from numpy import nan
-
import pytest
-from pandas import Timestamp
-from pandas.core.index import MultiIndex
-from pandas.core.api import DataFrame
-
-from pandas.core.series import Series
-
-from pandas.util.testing import (assert_frame_equal, assert_series_equal
- )
-from pandas.compat import (lmap)
-
-from pandas import compat
-
-import pandas.core.common as com
import numpy as np
-
import pandas.util.testing as tm
+from pandas import Timestamp, DataFrame, Series
import pandas as pd
-class TestGroupByFilter(object):
-
- def setup_method(self, method):
- self.ts = tm.makeTimeSeries()
-
- self.seriesd = tm.getSeriesData()
- self.tsd = tm.getTimeSeriesData()
- self.frame = DataFrame(self.seriesd)
- self.tsframe = DataFrame(self.tsd)
-
- self.df = DataFrame(
- {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
- 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
- 'C': np.random.randn(8),
- 'D': np.random.randn(8)})
-
- self.df_mixed_floats = DataFrame(
- {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
- 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
- 'C': np.random.randn(8),
- 'D': np.array(
- np.random.randn(8), dtype='float32')})
-
- index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
- 'three']],
- labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
- [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
- names=['first', 'second'])
- self.mframe = DataFrame(np.random.randn(10, 3), index=index,
- columns=['A', 'B', 'C'])
-
- self.three_group = DataFrame(
- {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
- 'foo', 'foo', 'foo'],
- 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
- 'two', 'two', 'one'],
- 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
- 'dull', 'shiny', 'shiny', 'shiny'],
- 'D': np.random.randn(11),
- 'E': np.random.randn(11),
- 'F': np.random.randn(11)})
-
- def test_filter_series(self):
- s = pd.Series([1, 3, 20, 5, 22, 24, 7])
- expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
- expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
- grouper = s.apply(lambda x: x % 2)
- grouped = s.groupby(grouper)
- assert_series_equal(
- grouped.filter(lambda x: x.mean() < 10), expected_odd)
- assert_series_equal(
- grouped.filter(lambda x: x.mean() > 10), expected_even)
- # Test dropna=False.
- assert_series_equal(
- grouped.filter(lambda x: x.mean() < 10, dropna=False),
- expected_odd.reindex(s.index))
- assert_series_equal(
- grouped.filter(lambda x: x.mean() > 10, dropna=False),
- expected_even.reindex(s.index))
-
- def test_filter_single_column_df(self):
- df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
- expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
- expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
- grouper = df[0].apply(lambda x: x % 2)
- grouped = df.groupby(grouper)
- assert_frame_equal(
- grouped.filter(lambda x: x.mean() < 10), expected_odd)
- assert_frame_equal(
- grouped.filter(lambda x: x.mean() > 10), expected_even)
- # Test dropna=False.
- assert_frame_equal(
- grouped.filter(lambda x: x.mean() < 10, dropna=False),
- expected_odd.reindex(df.index))
- assert_frame_equal(
- grouped.filter(lambda x: x.mean() > 10, dropna=False),
- expected_even.reindex(df.index))
-
- def test_filter_multi_column_df(self):
- df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
- grouper = df['A'].apply(lambda x: x % 2)
- grouped = df.groupby(grouper)
- expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
- assert_frame_equal(
- grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10),
- expected)
-
- def test_filter_mixed_df(self):
- df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
- grouper = df['A'].apply(lambda x: x % 2)
- grouped = df.groupby(grouper)
- expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2])
- assert_frame_equal(
- grouped.filter(lambda x: x['A'].sum() > 10), expected)
-
- def test_filter_out_all_groups(self):
- s = pd.Series([1, 3, 20, 5, 22, 24, 7])
- grouper = s.apply(lambda x: x % 2)
- grouped = s.groupby(grouper)
- assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
- df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
- grouper = df['A'].apply(lambda x: x % 2)
- grouped = df.groupby(grouper)
- assert_frame_equal(
- grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]])
-
- def test_filter_out_no_groups(self):
- s = pd.Series([1, 3, 20, 5, 22, 24, 7])
- grouper = s.apply(lambda x: x % 2)
- grouped = s.groupby(grouper)
- filtered = grouped.filter(lambda x: x.mean() > 0)
- assert_series_equal(filtered, s)
- df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
- grouper = df['A'].apply(lambda x: x % 2)
- grouped = df.groupby(grouper)
- filtered = grouped.filter(lambda x: x['A'].mean() > 0)
- assert_frame_equal(filtered, df)
-
- def test_filter_out_all_groups_in_df(self):
- # GH12768
- df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
- res = df.groupby('a')
- res = res.filter(lambda x: x['b'].sum() > 5, dropna=False)
- expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3})
- assert_frame_equal(expected, res)
-
- df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
- res = df.groupby('a')
- res = res.filter(lambda x: x['b'].sum() > 5, dropna=True)
- expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64")
- assert_frame_equal(expected, res)
-
- def test_filter_condition_raises(self):
- def raise_if_sum_is_zero(x):
- if x.sum() == 0:
- raise ValueError
- else:
- return x.sum() > 0
-
- s = pd.Series([-1, 0, 1, 2])
- grouper = s.apply(lambda x: x % 2)
- grouped = s.groupby(grouper)
- pytest.raises(TypeError,
- lambda: grouped.filter(raise_if_sum_is_zero))
-
- def test_filter_with_axis_in_groupby(self):
- # issue 11041
- index = pd.MultiIndex.from_product([range(10), [0, 1]])
- data = pd.DataFrame(
- np.arange(100).reshape(-1, 20), columns=index, dtype='int64')
- result = data.groupby(level=0,
- axis=1).filter(lambda x: x.iloc[0, 0] > 10)
- expected = data.iloc[:, 12:20]
- assert_frame_equal(result, expected)
-
- def test_filter_bad_shapes(self):
- df = DataFrame({'A': np.arange(8),
- 'B': list('aabbbbcc'),
- 'C': np.arange(8)})
- s = df['B']
- g_df = df.groupby('B')
- g_s = s.groupby(s)
-
- f = lambda x: x
- pytest.raises(TypeError, lambda: g_df.filter(f))
- pytest.raises(TypeError, lambda: g_s.filter(f))
-
- f = lambda x: x == 1
- pytest.raises(TypeError, lambda: g_df.filter(f))
- pytest.raises(TypeError, lambda: g_s.filter(f))
-
- f = lambda x: np.outer(x, x)
- pytest.raises(TypeError, lambda: g_df.filter(f))
- pytest.raises(TypeError, lambda: g_s.filter(f))
-
- def test_filter_nan_is_false(self):
- df = DataFrame({'A': np.arange(8),
- 'B': list('aabbbbcc'),
- 'C': np.arange(8)})
- s = df['B']
- g_df = df.groupby(df['B'])
- g_s = s.groupby(s)
-
- f = lambda x: np.nan
- assert_frame_equal(g_df.filter(f), df.loc[[]])
- assert_series_equal(g_s.filter(f), s[[]])
-
- def test_filter_against_workaround(self):
- np.random.seed(0)
- # Series of ints
- s = Series(np.random.randint(0, 100, 1000))
- grouper = s.apply(lambda x: np.round(x, -1))
- grouped = s.groupby(grouper)
- f = lambda x: x.mean() > 10
-
- old_way = s[grouped.transform(f).astype('bool')]
- new_way = grouped.filter(f)
- assert_series_equal(new_way.sort_values(), old_way.sort_values())
-
- # Series of floats
- s = 100 * Series(np.random.random(1000))
- grouper = s.apply(lambda x: np.round(x, -1))
- grouped = s.groupby(grouper)
- f = lambda x: x.mean() > 10
- old_way = s[grouped.transform(f).astype('bool')]
- new_way = grouped.filter(f)
- assert_series_equal(new_way.sort_values(), old_way.sort_values())
-
- # Set up DataFrame of ints, floats, strings.
- from string import ascii_lowercase
- letters = np.array(list(ascii_lowercase))
- N = 1000
- random_letters = letters.take(np.random.randint(0, 26, N))
- df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
- 'floats': N / 10 * Series(np.random.random(N)),
- 'letters': Series(random_letters)})
-
- # Group by ints; filter on floats.
- grouped = df.groupby('ints')
- old_way = df[grouped.floats.
- transform(lambda x: x.mean() > N / 20).astype('bool')]
- new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
- assert_frame_equal(new_way, old_way)
-
- # Group by floats (rounded); filter on strings.
- grouper = df.floats.apply(lambda x: np.round(x, -1))
- grouped = df.groupby(grouper)
- old_way = df[grouped.letters.
- transform(lambda x: len(x) < N / 10).astype('bool')]
- new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
- assert_frame_equal(new_way, old_way)
-
- # Group by strings; filter on ints.
- grouped = df.groupby('letters')
- old_way = df[grouped.ints.
- transform(lambda x: x.mean() > N / 20).astype('bool')]
- new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
- assert_frame_equal(new_way, old_way)
-
- def test_filter_using_len(self):
- # BUG GH4447
- df = DataFrame({'A': np.arange(8),
- 'B': list('aabbbbcc'),
- 'C': np.arange(8)})
- grouped = df.groupby('B')
- actual = grouped.filter(lambda x: len(x) > 2)
- expected = DataFrame(
- {'A': np.arange(2, 6),
- 'B': list('bbbb'),
- 'C': np.arange(2, 6)}, index=np.arange(2, 6))
- assert_frame_equal(actual, expected)
-
- actual = grouped.filter(lambda x: len(x) > 4)
- expected = df.loc[[]]
- assert_frame_equal(actual, expected)
-
- # Series have always worked properly, but we'll test anyway.
- s = df['B']
- grouped = s.groupby(s)
- actual = grouped.filter(lambda x: len(x) > 2)
- expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
- assert_series_equal(actual, expected)
-
- actual = grouped.filter(lambda x: len(x) > 4)
- expected = s[[]]
- assert_series_equal(actual, expected)
-
- def test_filter_maintains_ordering(self):
- # Simple case: index is sequential. #4621
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
- s = df['pid']
- grouped = df.groupby('tag')
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = df.iloc[[1, 2, 4, 7]]
- assert_frame_equal(actual, expected)
-
- grouped = s.groupby(df['tag'])
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = s.iloc[[1, 2, 4, 7]]
- assert_series_equal(actual, expected)
-
- # Now index is sequentially decreasing.
- df.index = np.arange(len(df) - 1, -1, -1)
- s = df['pid']
- grouped = df.groupby('tag')
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = df.iloc[[1, 2, 4, 7]]
- assert_frame_equal(actual, expected)
-
- grouped = s.groupby(df['tag'])
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = s.iloc[[1, 2, 4, 7]]
- assert_series_equal(actual, expected)
-
- # Index is shuffled.
- SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
- df.index = df.index[SHUFFLED]
- s = df['pid']
- grouped = df.groupby('tag')
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = df.iloc[[1, 2, 4, 7]]
- assert_frame_equal(actual, expected)
-
- grouped = s.groupby(df['tag'])
- actual = grouped.filter(lambda x: len(x) > 1)
- expected = s.iloc[[1, 2, 4, 7]]
- assert_series_equal(actual, expected)
-
- def test_filter_multiple_timestamp(self):
- # GH 10114
- df = DataFrame({'A': np.arange(5, dtype='int64'),
- 'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
- 'C': Timestamp('20130101')})
-
- grouped = df.groupby(['B', 'C'])
-
- result = grouped['A'].filter(lambda x: True)
- assert_series_equal(df['A'], result)
-
- result = grouped['A'].transform(len)
- expected = Series([2, 3, 2, 3, 3], name='A')
- assert_series_equal(result, expected)
-
- result = grouped.filter(lambda x: True)
- assert_frame_equal(df, result)
-
- result = grouped.transform('sum')
- expected = DataFrame({'A': [2, 8, 2, 8, 8]})
- assert_frame_equal(result, expected)
-
- result = grouped.transform(len)
- expected = DataFrame({'A': [2, 3, 2, 3, 3]})
- assert_frame_equal(result, expected)
-
- def test_filter_and_transform_with_non_unique_int_index(self):
- # GH4620
- index = [1, 1, 1, 2, 1, 1, 0, 1]
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
- grouped_df = df.groupby('tag')
- ser = df['pid']
- grouped_ser = ser.groupby(df['tag'])
- expected_indexes = [1, 2, 4, 7]
-
- # Filter DataFrame
- actual = grouped_df.filter(lambda x: len(x) > 1)
- expected = df.iloc[expected_indexes]
- assert_frame_equal(actual, expected)
-
- actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
- expected = df.copy()
- expected.iloc[[0, 3, 5, 6]] = np.nan
- assert_frame_equal(actual, expected)
-
- # Filter Series
- actual = grouped_ser.filter(lambda x: len(x) > 1)
- expected = ser.take(expected_indexes)
- assert_series_equal(actual, expected)
-
- actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
- NA = np.nan
- expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
- # ^ made manually because this can get confusing!
- assert_series_equal(actual, expected)
-
- # Transform Series
- actual = grouped_ser.transform(len)
- expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
- assert_series_equal(actual, expected)
-
- # Transform (a column from) DataFrameGroupBy
- actual = grouped_df.pid.transform(len)
- assert_series_equal(actual, expected)
-
- def test_filter_and_transform_with_multiple_non_unique_int_index(self):
- # GH4620
- index = [1, 1, 1, 2, 0, 0, 0, 1]
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
- grouped_df = df.groupby('tag')
- ser = df['pid']
- grouped_ser = ser.groupby(df['tag'])
- expected_indexes = [1, 2, 4, 7]
-
- # Filter DataFrame
- actual = grouped_df.filter(lambda x: len(x) > 1)
- expected = df.iloc[expected_indexes]
- assert_frame_equal(actual, expected)
-
- actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
- expected = df.copy()
- expected.iloc[[0, 3, 5, 6]] = np.nan
- assert_frame_equal(actual, expected)
-
- # Filter Series
- actual = grouped_ser.filter(lambda x: len(x) > 1)
- expected = ser.take(expected_indexes)
- assert_series_equal(actual, expected)
-
- actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
- NA = np.nan
- expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
- # ^ made manually because this can get confusing!
- assert_series_equal(actual, expected)
-
- # Transform Series
- actual = grouped_ser.transform(len)
- expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
- assert_series_equal(actual, expected)
-
- # Transform (a column from) DataFrameGroupBy
- actual = grouped_df.pid.transform(len)
- assert_series_equal(actual, expected)
-
- def test_filter_and_transform_with_non_unique_float_index(self):
- # GH4620
- index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
- grouped_df = df.groupby('tag')
- ser = df['pid']
- grouped_ser = ser.groupby(df['tag'])
- expected_indexes = [1, 2, 4, 7]
-
- # Filter DataFrame
- actual = grouped_df.filter(lambda x: len(x) > 1)
- expected = df.iloc[expected_indexes]
- assert_frame_equal(actual, expected)
-
- actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
- expected = df.copy()
- expected.iloc[[0, 3, 5, 6]] = np.nan
- assert_frame_equal(actual, expected)
-
- # Filter Series
- actual = grouped_ser.filter(lambda x: len(x) > 1)
- expected = ser.take(expected_indexes)
- assert_series_equal(actual, expected)
-
- actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
- NA = np.nan
- expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
- # ^ made manually because this can get confusing!
- assert_series_equal(actual, expected)
-
- # Transform Series
- actual = grouped_ser.transform(len)
- expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
- assert_series_equal(actual, expected)
-
- # Transform (a column from) DataFrameGroupBy
- actual = grouped_df.pid.transform(len)
- assert_series_equal(actual, expected)
-
- def test_filter_and_transform_with_non_unique_timestamp_index(self):
- # GH4620
- t0 = Timestamp('2013-09-30 00:05:00')
- t1 = Timestamp('2013-10-30 00:05:00')
- t2 = Timestamp('2013-11-30 00:05:00')
- index = [t1, t1, t1, t2, t1, t1, t0, t1]
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
- grouped_df = df.groupby('tag')
- ser = df['pid']
- grouped_ser = ser.groupby(df['tag'])
- expected_indexes = [1, 2, 4, 7]
-
- # Filter DataFrame
- actual = grouped_df.filter(lambda x: len(x) > 1)
- expected = df.iloc[expected_indexes]
- assert_frame_equal(actual, expected)
-
- actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
- expected = df.copy()
- expected.iloc[[0, 3, 5, 6]] = np.nan
- assert_frame_equal(actual, expected)
-
- # Filter Series
- actual = grouped_ser.filter(lambda x: len(x) > 1)
- expected = ser.take(expected_indexes)
- assert_series_equal(actual, expected)
-
- actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
- NA = np.nan
- expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
- # ^ made manually because this can get confusing!
- assert_series_equal(actual, expected)
-
- # Transform Series
- actual = grouped_ser.transform(len)
- expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
- assert_series_equal(actual, expected)
-
- # Transform (a column from) DataFrameGroupBy
- actual = grouped_df.pid.transform(len)
- assert_series_equal(actual, expected)
-
- def test_filter_and_transform_with_non_unique_string_index(self):
- # GH4620
- index = list('bbbcbbab')
- df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
- 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
- grouped_df = df.groupby('tag')
- ser = df['pid']
- grouped_ser = ser.groupby(df['tag'])
- expected_indexes = [1, 2, 4, 7]
-
- # Filter DataFrame
- actual = grouped_df.filter(lambda x: len(x) > 1)
- expected = df.iloc[expected_indexes]
- assert_frame_equal(actual, expected)
-
- actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
- expected = df.copy()
- expected.iloc[[0, 3, 5, 6]] = np.nan
- assert_frame_equal(actual, expected)
-
- # Filter Series
- actual = grouped_ser.filter(lambda x: len(x) > 1)
- expected = ser.take(expected_indexes)
- assert_series_equal(actual, expected)
-
- actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
- NA = np.nan
- expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
- # ^ made manually because this can get confusing!
- assert_series_equal(actual, expected)
-
- # Transform Series
- actual = grouped_ser.transform(len)
- expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
- assert_series_equal(actual, expected)
-
- # Transform (a column from) DataFrameGroupBy
- actual = grouped_df.pid.transform(len)
- assert_series_equal(actual, expected)
-
- def test_filter_has_access_to_grouped_cols(self):
- df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- # previously didn't have access to col A #????
- filt = g.filter(lambda x: x['A'].sum() == 2)
- assert_frame_equal(filt, df.iloc[[0, 1]])
-
- def test_filter_enforces_scalarness(self):
- df = pd.DataFrame([
- ['best', 'a', 'x'],
- ['worst', 'b', 'y'],
- ['best', 'c', 'x'],
- ['best', 'd', 'y'],
- ['worst', 'd', 'y'],
- ['worst', 'd', 'y'],
- ['best', 'd', 'z'],
- ], columns=['a', 'b', 'c'])
- with tm.assert_raises_regex(TypeError,
- 'filter function returned a.*'):
- df.groupby('c').filter(lambda g: g['a'] == 'best')
-
- def test_filter_non_bool_raises(self):
- df = pd.DataFrame([
- ['best', 'a', 1],
- ['worst', 'b', 1],
- ['best', 'c', 1],
- ['best', 'd', 1],
- ['worst', 'd', 1],
- ['worst', 'd', 1],
- ['best', 'd', 1],
- ], columns=['a', 'b', 'c'])
- with tm.assert_raises_regex(TypeError,
- 'filter function returned a.*'):
- df.groupby('a').filter(lambda g: g.c.mean())
-
- def test_filter_dropna_with_empty_groups(self):
- # GH 10780
- data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
- groupped = data.groupby(level=0)
- result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False)
- expected_false = pd.Series([np.nan] * 9,
- index=np.repeat([1, 2, 3], 3))
- tm.assert_series_equal(result_false, expected_false)
-
- result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True)
- expected_true = pd.Series(index=pd.Index([], dtype=int))
- tm.assert_series_equal(result_true, expected_true)
-
-
-def assert_fp_equal(a, b):
- assert (np.abs(a - b) < 1e-12).all()
-
-
-def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
- tups = lmap(tuple, df[keys].values)
- tups = com._asarray_tuplesafe(tups)
- expected = f(df.groupby(tups)[field])
- for k, v in compat.iteritems(expected):
- assert (result[k] == v)
+def test_filter_series():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
+ expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() < 10), expected_odd)
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() > 10), expected_even)
+ # Test dropna=False.
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() < 10, dropna=False),
+ expected_odd.reindex(s.index))
+ tm.assert_series_equal(
+ grouped.filter(lambda x: x.mean() > 10, dropna=False),
+ expected_even.reindex(s.index))
+
+
+def test_filter_single_column_df():
+ df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
+ expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
+ expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
+ grouper = df[0].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() < 10), expected_odd)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() > 10), expected_even)
+ # Test dropna=False.
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() < 10, dropna=False),
+ expected_odd.reindex(df.index))
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x.mean() > 10, dropna=False),
+ expected_even.reindex(df.index))
+
+
+def test_filter_multi_column_df():
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10),
+ expected)
+
+
+def test_filter_mixed_df():
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2])
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() > 10), expected)
+
+
+def test_filter_out_all_groups():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ tm.assert_frame_equal(
+ grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]])
+
+
+def test_filter_out_no_groups():
+ s = pd.Series([1, 3, 20, 5, 22, 24, 7])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ filtered = grouped.filter(lambda x: x.mean() > 0)
+ tm.assert_series_equal(filtered, s)
+ df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
+ grouper = df['A'].apply(lambda x: x % 2)
+ grouped = df.groupby(grouper)
+ filtered = grouped.filter(lambda x: x['A'].mean() > 0)
+ tm.assert_frame_equal(filtered, df)
+
+
+def test_filter_out_all_groups_in_df():
+ # GH12768
+ df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
+ res = df.groupby('a')
+ res = res.filter(lambda x: x['b'].sum() > 5, dropna=False)
+ expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3})
+ tm.assert_frame_equal(expected, res)
+
+ df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
+ res = df.groupby('a')
+ res = res.filter(lambda x: x['b'].sum() > 5, dropna=True)
+ expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64")
+ tm.assert_frame_equal(expected, res)
+
+
+def test_filter_condition_raises():
+ def raise_if_sum_is_zero(x):
+ if x.sum() == 0:
+ raise ValueError
+ else:
+ return x.sum() > 0
+
+ s = pd.Series([-1, 0, 1, 2])
+ grouper = s.apply(lambda x: x % 2)
+ grouped = s.groupby(grouper)
+ pytest.raises(TypeError,
+ lambda: grouped.filter(raise_if_sum_is_zero))
+
+
+def test_filter_with_axis_in_groupby():
+ # issue 11041
+ index = pd.MultiIndex.from_product([range(10), [0, 1]])
+ data = pd.DataFrame(
+ np.arange(100).reshape(-1, 20), columns=index, dtype='int64')
+ result = data.groupby(level=0,
+ axis=1).filter(lambda x: x.iloc[0, 0] > 10)
+ expected = data.iloc[:, 12:20]
+ tm.assert_frame_equal(result, expected)
+
+
+def test_filter_bad_shapes():
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ s = df['B']
+ g_df = df.groupby('B')
+ g_s = s.groupby(s)
+
+ f = lambda x: x
+ pytest.raises(TypeError, lambda: g_df.filter(f))
+ pytest.raises(TypeError, lambda: g_s.filter(f))
+
+ f = lambda x: x == 1
+ pytest.raises(TypeError, lambda: g_df.filter(f))
+ pytest.raises(TypeError, lambda: g_s.filter(f))
+
+ f = lambda x: np.outer(x, x)
+ pytest.raises(TypeError, lambda: g_df.filter(f))
+ pytest.raises(TypeError, lambda: g_s.filter(f))
+
+
+def test_filter_nan_is_false():
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ s = df['B']
+ g_df = df.groupby(df['B'])
+ g_s = s.groupby(s)
+
+ f = lambda x: np.nan
+ tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
+ tm.assert_series_equal(g_s.filter(f), s[[]])
+
+
+def test_filter_against_workaround():
+ np.random.seed(0)
+ # Series of ints
+ s = Series(np.random.randint(0, 100, 1000))
+ grouper = s.apply(lambda x: np.round(x, -1))
+ grouped = s.groupby(grouper)
+ f = lambda x: x.mean() > 10
+
+ old_way = s[grouped.transform(f).astype('bool')]
+ new_way = grouped.filter(f)
+ tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+ # Series of floats
+ s = 100 * Series(np.random.random(1000))
+ grouper = s.apply(lambda x: np.round(x, -1))
+ grouped = s.groupby(grouper)
+ f = lambda x: x.mean() > 10
+ old_way = s[grouped.transform(f).astype('bool')]
+ new_way = grouped.filter(f)
+ tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+ # Set up DataFrame of ints, floats, strings.
+ from string import ascii_lowercase
+ letters = np.array(list(ascii_lowercase))
+ N = 1000
+ random_letters = letters.take(np.random.randint(0, 26, N))
+ df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
+ 'floats': N / 10 * Series(np.random.random(N)),
+ 'letters': Series(random_letters)})
+
+ # Group by ints; filter on floats.
+ grouped = df.groupby('ints')
+ old_way = df[grouped.floats.
+ transform(lambda x: x.mean() > N / 20).astype('bool')]
+ new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
+ tm.assert_frame_equal(new_way, old_way)
+
+ # Group by floats (rounded); filter on strings.
+ grouper = df.floats.apply(lambda x: np.round(x, -1))
+ grouped = df.groupby(grouper)
+ old_way = df[grouped.letters.
+ transform(lambda x: len(x) < N / 10).astype('bool')]
+ new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
+ tm.assert_frame_equal(new_way, old_way)
+
+ # Group by strings; filter on ints.
+ grouped = df.groupby('letters')
+ old_way = df[grouped.ints.
+ transform(lambda x: x.mean() > N / 20).astype('bool')]
+ new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
+ tm.assert_frame_equal(new_way, old_way)
+
+
+def test_filter_using_len():
+ # BUG GH4447
+ df = DataFrame({'A': np.arange(8),
+ 'B': list('aabbbbcc'),
+ 'C': np.arange(8)})
+ grouped = df.groupby('B')
+ actual = grouped.filter(lambda x: len(x) > 2)
+ expected = DataFrame(
+ {'A': np.arange(2, 6),
+ 'B': list('bbbb'),
+ 'C': np.arange(2, 6)}, index=np.arange(2, 6))
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped.filter(lambda x: len(x) > 4)
+ expected = df.loc[[]]
+ tm.assert_frame_equal(actual, expected)
+
+ # Series have always worked properly, but we'll test anyway.
+ s = df['B']
+ grouped = s.groupby(s)
+ actual = grouped.filter(lambda x: len(x) > 2)
+ expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped.filter(lambda x: len(x) > 4)
+ expected = s[[]]
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_maintains_ordering():
+ # Simple case: index is sequential. #4621
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+ # Now index is sequentially decreasing.
+ df.index = np.arange(len(df) - 1, -1, -1)
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+ # Index is shuffled.
+ SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
+ df.index = df.index[SHUFFLED]
+ s = df['pid']
+ grouped = df.groupby('tag')
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = df.iloc[[1, 2, 4, 7]]
+ tm.assert_frame_equal(actual, expected)
+
+ grouped = s.groupby(df['tag'])
+ actual = grouped.filter(lambda x: len(x) > 1)
+ expected = s.iloc[[1, 2, 4, 7]]
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_multiple_timestamp():
+ # GH 10114
+ df = DataFrame({'A': np.arange(5, dtype='int64'),
+ 'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
+ 'C': Timestamp('20130101')})
+
+ grouped = df.groupby(['B', 'C'])
+
+ result = grouped['A'].filter(lambda x: True)
+ tm.assert_series_equal(df['A'], result)
+
+ result = grouped['A'].transform(len)
+ expected = Series([2, 3, 2, 3, 3], name='A')
+ tm.assert_series_equal(result, expected)
+
+ result = grouped.filter(lambda x: True)
+ tm.assert_frame_equal(df, result)
+
+ result = grouped.transform('sum')
+ expected = DataFrame({'A': [2, 8, 2, 8, 8]})
+ tm.assert_frame_equal(result, expected)
+
+ result = grouped.transform(len)
+ expected = DataFrame({'A': [2, 3, 2, 3, 3]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_filter_and_transform_with_non_unique_int_index():
+ # GH4620
+ index = [1, 1, 1, 2, 1, 1, 0, 1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_multiple_non_unique_int_index():
+ # GH4620
+ index = [1, 1, 1, 2, 0, 0, 0, 1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_float_index():
+ # GH4620
+ index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_timestamp_index():
+ # GH4620
+ t0 = Timestamp('2013-09-30 00:05:00')
+ t1 = Timestamp('2013-10-30 00:05:00')
+ t2 = Timestamp('2013-11-30 00:05:00')
+ index = [t1, t1, t1, t2, t1, t1, t0, t1]
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_string_index():
+ # GH4620
+ index = list('bbbcbbab')
+ df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
+ 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
+ grouped_df = df.groupby('tag')
+ ser = df['pid']
+ grouped_ser = ser.groupby(df['tag'])
+ expected_indexes = [1, 2, 4, 7]
+
+ # Filter DataFrame
+ actual = grouped_df.filter(lambda x: len(x) > 1)
+ expected = df.iloc[expected_indexes]
+ tm.assert_frame_equal(actual, expected)
+
+ actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+ expected = df.copy()
+ expected.iloc[[0, 3, 5, 6]] = np.nan
+ tm.assert_frame_equal(actual, expected)
+
+ # Filter Series
+ actual = grouped_ser.filter(lambda x: len(x) > 1)
+ expected = ser.take(expected_indexes)
+ tm.assert_series_equal(actual, expected)
+
+ actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+ NA = np.nan
+ expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
+ # ^ made manually because this can get confusing!
+ tm.assert_series_equal(actual, expected)
+
+ # Transform Series
+ actual = grouped_ser.transform(len)
+ expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
+ tm.assert_series_equal(actual, expected)
+
+ # Transform (a column from) DataFrameGroupBy
+ actual = grouped_df.pid.transform(len)
+ tm.assert_series_equal(actual, expected)
+
+
+def test_filter_has_access_to_grouped_cols():
+ df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ # previously didn't have access to col A #????
+ filt = g.filter(lambda x: x['A'].sum() == 2)
+ tm.assert_frame_equal(filt, df.iloc[[0, 1]])
+
+
+def test_filter_enforces_scalarness():
+ df = pd.DataFrame([
+ ['best', 'a', 'x'],
+ ['worst', 'b', 'y'],
+ ['best', 'c', 'x'],
+ ['best', 'd', 'y'],
+ ['worst', 'd', 'y'],
+ ['worst', 'd', 'y'],
+ ['best', 'd', 'z'],
+ ], columns=['a', 'b', 'c'])
+ with tm.assert_raises_regex(TypeError,
+ 'filter function returned a.*'):
+ df.groupby('c').filter(lambda g: g['a'] == 'best')
+
+
+def test_filter_non_bool_raises():
+ df = pd.DataFrame([
+ ['best', 'a', 1],
+ ['worst', 'b', 1],
+ ['best', 'c', 1],
+ ['best', 'd', 1],
+ ['worst', 'd', 1],
+ ['worst', 'd', 1],
+ ['best', 'd', 1],
+ ], columns=['a', 'b', 'c'])
+ with tm.assert_raises_regex(TypeError,
+ 'filter function returned a.*'):
+ df.groupby('a').filter(lambda g: g.c.mean())
+
+
+def test_filter_dropna_with_empty_groups():
+ # GH 10780
+ data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
+    grouped = data.groupby(level=0)
+    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
+ expected_false = pd.Series([np.nan] * 9,
+ index=np.repeat([1, 2, 3], 3))
+ tm.assert_series_equal(result_false, expected_false)
+
+    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
+ expected_true = pd.Series(index=pd.Index([], dtype=int))
+ tm.assert_series_equal(result_true, expected_true)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
new file mode 100644
index 0000000000000..ba1371fe9f931
--- /dev/null
+++ b/pandas/tests/groupby/test_function.py
@@ -0,0 +1,1120 @@
+import pytest
+
+import numpy as np
+import pandas as pd
+from pandas import (DataFrame, Index, compat, isna,
+ Series, MultiIndex, Timestamp, date_range)
+from pandas.errors import UnsupportedFunctionCall
+from pandas.util import testing as tm
+import pandas.core.nanops as nanops
+from string import ascii_lowercase
+from pandas.compat import product as cart_product
+
+
+@pytest.mark.parametrize("agg_func", ['any', 'all'])
+@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize("vals", [
+ ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
+ [1, 2, 3], [1, 0, 0], [0, 0, 0],
+ [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
+ [True, True, True], [True, False, False], [False, False, False],
+ [np.nan, np.nan, np.nan]
+])
+def test_groupby_bool_aggs(agg_func, skipna, vals):
+ df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})
+
+ # Figure out expectation using Python builtin
+ exp = getattr(compat.builtins, agg_func)(vals)
+
+ # edge case for missing data with skipna and 'any'
+ if skipna and all(isna(vals)) and agg_func == 'any':
+ exp = False
+
+ exp_df = DataFrame([exp] * 2, columns=['val'], index=Index(
+ ['a', 'b'], name='key'))
+ result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
+ tm.assert_frame_equal(result, exp_df)
+
+
+def test_max_min_non_numeric():
+ # #2700
+ aa = DataFrame({'nn': [11, 11, 22, 22],
+ 'ii': [1, 2, 3, 4],
+ 'ss': 4 * ['mama']})
+
+ result = aa.groupby('nn').max()
+ assert 'ss' in result
+
+ result = aa.groupby('nn').max(numeric_only=False)
+ assert 'ss' in result
+
+ result = aa.groupby('nn').min()
+ assert 'ss' in result
+
+ result = aa.groupby('nn').min(numeric_only=False)
+ assert 'ss' in result
+
+
+def test_intercept_builtin_sum():
+ s = Series([1., 2., np.nan, 3.])
+ grouped = s.groupby([0, 1, 2, 2])
+
+ result = grouped.agg(compat.builtins.sum)
+ result2 = grouped.apply(compat.builtins.sum)
+ expected = grouped.sum()
+ tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result2, expected)
+
+
+def test_builtins_apply(): # GH8155
+ df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
+ columns=['jim', 'joe'])
+ df['jolie'] = np.random.randn(1000)
+
+ for keys in ['jim', ['jim', 'joe']]: # single key & multi-key
+ if keys == 'jim':
+ continue
+ for f in [max, min, sum]:
+ fname = f.__name__
+ result = df.groupby(keys).apply(f)
+ result.shape
+ ngroups = len(df.drop_duplicates(subset=keys))
+ assert result.shape == (ngroups, 3), 'invalid frame shape: '\
+ '{} (expected ({}, 3))'.format(result.shape, ngroups)
+
+ tm.assert_frame_equal(result, # numpy's equivalent function
+ df.groupby(keys).apply(getattr(np, fname)))
+
+ if f != sum:
+ expected = df.groupby(keys).agg(fname).reset_index()
+ expected.set_index(keys, inplace=True, drop=False)
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
+ tm.assert_series_equal(getattr(result, fname)(),
+ getattr(df, fname)())
+
+
+def test_arg_passthru():
+ # make sure that we are passing thru kwargs
+ # to our agg functions
+
+ # GH3668
+ # GH5724
+ df = pd.DataFrame(
+ {'group': [1, 1, 2],
+ 'int': [1, 2, 3],
+ 'float': [4., 5., 6.],
+ 'string': list('abc'),
+ 'category_string': pd.Series(list('abc')).astype('category'),
+ 'category_int': [7, 8, 9],
+ 'datetime': pd.date_range('20130101', periods=3),
+ 'datetimetz': pd.date_range('20130101',
+ periods=3,
+ tz='US/Eastern'),
+ 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
+ columns=['group', 'int', 'float', 'string',
+ 'category_string', 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+
+ expected_columns_numeric = Index(['int', 'float', 'category_int'])
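+ # the default numeric_only=True result keeps exactly these columns;
+ # 'category_int' is a plain integer column despite its name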
+
+ # mean / median
+ expected = pd.DataFrame(
+ {'category_int': [7.5, 9],
+ 'float': [4.5, 6.],
+ 'timedelta': [pd.Timedelta('1.5s'),
+ pd.Timedelta('3s')],
+ 'int': [1.5, 3],
+ 'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
+ pd.Timestamp('2013-01-03 00:00:00')],
+ 'datetimetz': [
+ pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
+ pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
+ index=Index([1, 2], name='group'),
+ columns=['int', 'float', 'category_int',
+ 'datetime', 'datetimetz', 'timedelta'])
+ for attr in ['mean', 'median']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+ # TODO: min, max *should* handle
+ # categorical (ordered) dtype
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['min', 'max']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_string', 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['first', 'last']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'string',
+ 'category_int', 'timedelta'])
+ for attr in ['sum']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'category_int'])
+ for attr in ['prod', 'cumprod']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ # like min, max, but don't include strings
+ expected_columns = Index(['int', 'float',
+ 'category_int',
+ 'datetime', 'datetimetz',
+ 'timedelta'])
+ for attr in ['cummin', 'cummax']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ # GH 15561: numeric_only=False set by default like min/max
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+ expected_columns = Index(['int', 'float', 'category_int',
+ 'timedelta'])
+ for attr in ['cumsum']:
+ f = getattr(df.groupby('group'), attr)
+ result = f()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = f(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
+
+
+def test_non_cython_api():
+
+ # GH5610
+ # non-cython calls should not include the grouper
+
+ df = DataFrame(
+ [[1, 2, 'foo'],
+ [1, np.nan, 'bar'],
+ [3, np.nan, 'baz']],
+ columns=['A', 'B', 'C'])
+ g = df.groupby('A')
+ gni = df.groupby('A', as_index=False)
+
+ # mad
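+ # group 1 has B == [2, NaN] -> mad 0; group 3 is all-NaN -> NaN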
+ expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
+ expected.index.name = 'A'
+ result = g.mad()
+ tm.assert_frame_equal(result, expected)
+
+ expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
+ index=[0, 1])
+ result = gni.mad()
+ tm.assert_frame_equal(result, expected)
+
+ # describe
+ expected_index = pd.Index([1, 3], name='A')
+ expected_col = pd.MultiIndex(levels=[['B'],
+ ['count', 'mean', 'std', 'min',
+ '25%', '50%', '75%', 'max']],
+ labels=[[0] * 8, list(range(8))])
+ expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+ [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
+ np.nan, np.nan]],
+ index=expected_index,
+ columns=expected_col)
+ result = g.describe()
+ tm.assert_frame_equal(result, expected)
+
+ expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
+ df[df.A == 3].describe().unstack().to_frame().T])
+ expected.index = pd.Index([0, 1])
+ result = gni.describe()
+ tm.assert_frame_equal(result, expected)
+
+ # any
+ expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
+ index=[1, 3])
+ expected.index.name = 'A'
+ result = g.any()
+ tm.assert_frame_equal(result, expected)
+
+ # idxmax
+ expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
+ expected.index.name = 'A'
+ result = g.idxmax()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_api2():
+
+ # this takes the fast apply path
+
+ # cumsum (GH5614)
+ df = DataFrame(
+ [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
+ ], columns=['A', 'B', 'C'])
+ expected = DataFrame(
+ [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
+ result = df.groupby('A').cumsum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 5755 - cumsum is a transformer and should ignore as_index
+ result = df.groupby('A', as_index=False).cumsum()
+ tm.assert_frame_equal(result, expected)
+
+ # GH 13994
+ result = df.groupby('A').cumsum(axis=1)
+ expected = df.cumsum(axis=1)
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').cumprod(axis=1)
+ expected = df.cumprod(axis=1)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_median():
+ df = DataFrame(np.random.randn(1000))
+ df.values[::2] = np.nan
+
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+ labels[::17] = np.nan
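+ # rows whose group label is NaN are dropped from every group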
+
+ result = df.groupby(labels).median()
+ exp = df.groupby(labels).agg(nanops.nanmedian)
+ tm.assert_frame_equal(result, exp)
+
+ df = DataFrame(np.random.randn(1000, 5))
+ rs = df.groupby(labels).agg(np.median)
+ xp = df.groupby(labels).median()
+ tm.assert_frame_equal(rs, xp)
+
+
+def test_median_empty_bins():
+ df = pd.DataFrame(np.random.randint(0, 44, 500))
+
+ grps = range(0, 55, 5)
+ bins = pd.cut(df[0], grps)
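+ # values stop at 43, so the upper bins are empty; median over an
+ # empty bin must still agree with the groupwise lambda below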
+
+ result = df.groupby(bins).median()
+ expected = df.groupby(bins).agg(lambda x: x.median())
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [
+ 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
+@pytest.mark.parametrize("method,data", [
+ ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+ ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+ ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+ ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+ ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+ 'args': [1]}),
+ ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+ 'out_type': 'int64'})
+])
+def test_groupby_non_arithmetic_agg_types(dtype, method, data):
+ # GH9311, GH6620
+ df = pd.DataFrame(
+ [{'a': 1, 'b': 1},
+ {'a': 1, 'b': 2},
+ {'a': 2, 'b': 3},
+ {'a': 2, 'b': 4}])
+
+ df['b'] = df.b.astype(dtype)
+
+ if 'args' not in data:
+ data['args'] = []
+
+ if 'out_type' in data:
+ out_type = data['out_type']
+ else:
+ out_type = dtype
+
+ exp = data['df']
+ df_out = pd.DataFrame(exp)
+
+ df_out['b'] = df_out.b.astype(out_type)
+ df_out.set_index('a', inplace=True)
+
+ grpd = df.groupby('a')
+ t = getattr(grpd, method)(*data['args'])
+ tm.assert_frame_equal(t, df_out)
+
+
+def test_groupby_non_arithmetic_agg_intlike_precision():
+ # GH9311, GH6620
+ c = 24650000000000000
+
+ inputs = ((Timestamp('2011-01-15 12:50:28.502376'),
+ Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c))
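+ # both the Timestamps and the large ints exceed 2**53, the largest
+ # integer float64 represents exactly, so a float round-trip would
+ # silently lose precision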
+
+ for i in inputs:
+ df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}])
+
+ grp_exp = {'first': {'expected': i[0]},
+ 'last': {'expected': i[1]},
+ 'min': {'expected': i[0]},
+ 'max': {'expected': i[1]},
+ 'nth': {'expected': i[1],
+ 'args': [1]},
+ 'count': {'expected': 2}}
+
+ for method, data in compat.iteritems(grp_exp):
+ if 'args' not in data:
+ data['args'] = []
+
+ grpd = df.groupby('a')
+ res = getattr(grpd, method)(*data['args'])
+ assert res.iloc[0].b == data['expected']
+
+
+def test_fill_consistency():
+
+ # GH9221
+ # keyword arguments passed through to the generated wrapper
+ # are only set when the passed kwarg is None
+ df = DataFrame(index=pd.MultiIndex.from_product(
+ [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
+ columns=Index(
+ ['1', '2'], name='id'))
+ df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
+ np.nan, 22, np.nan]
+ df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
+ np.nan, 44, np.nan]
+
+ expected = df.groupby(level=0, axis=0).fillna(method='ffill')
+ result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_cumprod():
+ # GH 4095
+ df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
+
+ actual = df.groupby('key')['value'].cumprod()
+ expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
+ expected.name = 'value'
+ tm.assert_series_equal(actual, expected)
+
+ df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
+ actual = df.groupby('key')['value'].cumprod()
+ # if the product overflows, groupby cumprod casts to float
+ # while numpy passes back invalid values
+ df['value'] = df['value'].astype(float)
+ expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
+ expected.name = 'value'
+ tm.assert_series_equal(actual, expected)
+
+
+def test_ops_general():
+ ops = [('mean', np.mean),
+ ('median', np.median),
+ ('std', np.std),
+ ('var', np.var),
+ ('sum', np.sum),
+ ('prod', np.prod),
+ ('min', np.min),
+ ('max', np.max),
+ ('first', lambda x: x.iloc[0]),
+ ('last', lambda x: x.iloc[-1]),
+ ('count', np.size), ]
+ try:
+ from scipy.stats import sem
+ except ImportError:
+ pass
+ else:
+ ops.append(('sem', sem))
+ df = DataFrame(np.random.randn(1000))
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+
+ for op, targop in ops:
+ result = getattr(df.groupby(labels), op)().astype(float)
+ expected = df.groupby(labels).agg(targop)
+ try:
+ tm.assert_frame_equal(result, expected)
+ except BaseException as exc:
+ exc.args += ('operation: %s' % op, )
+ raise
+
+
+def test_max_nan_bug():
+ raw = """,Date,app,File
+-04-23,2013-04-23 00:00:00,,log080001.log
+-05-06,2013-05-06 00:00:00,,log.log
+-05-07,2013-05-07 00:00:00,OE,xlsx"""
+
+ df = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
+ gb = df.groupby('Date')
+ r = gb[['File']].max()
+ e = gb['File'].max().to_frame()
+ tm.assert_frame_equal(r, e)
+ assert not r['File'].isna().any()
+
+
+def test_nlargest():
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+ b = Series(list('a' * 5 + 'b' * 5))
+ gb = a.groupby(b)
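+ # the result keeps each value's original position as the inner
+ # index level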
+ r = gb.nlargest(3)
+ e = Series([
+ 7, 5, 3, 10, 9, 6
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]]))
+ tm.assert_series_equal(r, e)
+
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+ gb = a.groupby(b)
+ e = Series([
+ 3, 2, 1, 3, 3, 2
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]]))
+ tm.assert_series_equal(gb.nlargest(3, keep='last'), e)
+
+
+def test_nsmallest():
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+ b = Series(list('a' * 5 + 'b' * 5))
+ gb = a.groupby(b)
+ r = gb.nsmallest(3)
+ e = Series([
+ 1, 2, 3, 0, 4, 6
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
+ tm.assert_series_equal(r, e)
+
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+ gb = a.groupby(b)
+ e = Series([
+ 0, 1, 1, 0, 1, 2
+ ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
+ tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
+
+
+def test_numpy_compat():
+ # see gh-12811
+ df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
+ g = df.groupby('A')
+
+ msg = "numpy operations are not valid with groupby"
+
+ for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'):
+ tm.assert_raises_regex(UnsupportedFunctionCall, msg,
+ getattr(g, func), 1, 2, 3)
+ tm.assert_raises_regex(UnsupportedFunctionCall, msg,
+ getattr(g, func), foo=1)
+
+
+def test_cummin_cummax():
+ # GH 15048
+ num_types = [np.int32, np.int64, np.float32, np.float64]
+ num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
+ np.finfo(np.float32).min, np.finfo(np.float64).min]
+ num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
+ np.finfo(np.float32).max, np.finfo(np.float64).max]
+ base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
+ 'B': [3, 4, 3, 2, 2, 3, 2, 1]})
+ expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+ expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
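+ # running min/max within each 'A' group (rows 0-3 and rows 4-7)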
+
+ for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
+ df = base_df.astype(dtype)
+
+ # cummin
+ expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
+ result = df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test cummin w/ min value for dtype
+ df.loc[[2, 6], 'B'] = min_val
+ expected.loc[[2, 3, 6, 7], 'B'] = min_val
+ result = df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # cummax
+ expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
+ result = df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test cummax w/ max value for dtype
+ df.loc[[2, 6], 'B'] = max_val
+ expected.loc[[2, 3, 6, 7], 'B'] = max_val
+ result = df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(result, expected)
+
+ # Test nan in some values
+ base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
+ expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
+ np.nan, 3, np.nan, 1]})
+ result = base_df.groupby('A').cummin()
+ tm.assert_frame_equal(result, expected)
+ expected = (base_df.groupby('A')
+ .B
+ .apply(lambda x: x.cummin())
+ .to_frame())
+ tm.assert_frame_equal(result, expected)
+
+ expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
+ np.nan, 3, np.nan, 3]})
+ result = base_df.groupby('A').cummax()
+ tm.assert_frame_equal(result, expected)
+ expected = (base_df.groupby('A')
+ .B
+ .apply(lambda x: x.cummax())
+ .to_frame())
+ tm.assert_frame_equal(result, expected)
+
+ # Test nan in entire column
+ base_df['B'] = np.nan
+ expected = pd.DataFrame({'B': [np.nan] * 8})
+ result = base_df.groupby('A').cummin()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').cummax()
+ tm.assert_frame_equal(expected, result)
+ result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
+ tm.assert_frame_equal(expected, result)
+
+ # GH 15561
+ df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
+ expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
+ for method in ['cummax', 'cummin']:
+ result = getattr(df.groupby('a')['b'], method)()
+ tm.assert_series_equal(expected, result)
+
+ # GH 15635
+ df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
+ result = df.groupby('a').b.cummax()
+ expected = pd.Series([2, 1, 2], name='b')
+ tm.assert_series_equal(result, expected)
+
+ df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
+ result = df.groupby('a').b.cummin()
+ expected = pd.Series([1, 2, 1], name='b')
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('in_vals, out_vals', [
+
+ # Basics: strictly increasing (T), strictly decreasing (F),
+ # abs val increasing (F), non-strictly increasing (T)
+ ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
+ [True, False, False, True]),
+
+ # Test with inf vals
+ ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
+ [True, False, True, False]),
+
+ # Test with nan vals; should always be False
+ ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+ [False, False, False, False]),
+])
+def test_is_monotonic_increasing(in_vals, out_vals):
+ # GH 17015
+ source_dict = {
+ 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
+ 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
+ 'C': in_vals}
+ df = pd.DataFrame(source_dict)
+ result = df.groupby('B').C.is_monotonic_increasing
+ index = Index(list('abcd'), name='B')
+ expected = pd.Series(index=index, data=out_vals, name='C')
+ tm.assert_series_equal(result, expected)
+
+ # Also check result equal to manually taking x.is_monotonic_increasing.
+ expected = (
+ df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing))
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('in_vals, out_vals', [
+ # Basics: strictly decreasing (T), strictly increasing (F),
+ # abs val decreasing (F), non-strictly decreasing (T)
+ ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
+ [True, False, False, True]),
+
+ # Test with inf vals
+ ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
+ [True, True, False, True]),
+
+ # Test with nan vals; should always be False
+ ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+ [False, False, False, False]),
+])
+def test_is_monotonic_decreasing(in_vals, out_vals):
+ # GH 17015
+ source_dict = {
+ 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
+ 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
+ 'C': in_vals}
+
+ df = pd.DataFrame(source_dict)
+ result = df.groupby('B').C.is_monotonic_decreasing
+ index = Index(list('abcd'), name='B')
+ expected = pd.Series(index=index, data=out_vals, name='C')
+ tm.assert_series_equal(result, expected)
+
+
+# describe
+# --------------------------------
+
+def test_apply_describe_bug(mframe):
+ grouped = mframe.groupby(level='first')
+ grouped.describe() # it works!
+
+
+def test_series_describe_multikey():
+ ts = tm.makeTimeSeries()
+ grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+ result = grouped.describe()
+ tm.assert_series_equal(result['mean'], grouped.mean(),
+ check_names=False)
+ tm.assert_series_equal(result['std'], grouped.std(), check_names=False)
+ tm.assert_series_equal(result['min'], grouped.min(), check_names=False)
+
+
+def test_series_describe_single():
+ ts = tm.makeTimeSeries()
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.apply(lambda x: x.describe())
+ expected = grouped.describe().stack()
+ tm.assert_series_equal(result, expected)
+
+
+def test_series_index_name(df):
+ grouped = df.loc[:, ['C']].groupby(df['A'])
+ result = grouped.agg(lambda x: x.mean())
+ assert result.index.name == 'A'
+
+
+def test_frame_describe_multikey(tsframe):
+ grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+ result = grouped.describe()
+ desc_groups = []
+ for col in tsframe:
+ group = grouped[col].describe()
+ # GH 17464 - Remove duplicate MultiIndex levels
+ group_col = pd.MultiIndex(
+ levels=[[col], group.columns],
+ labels=[[0] * len(group.columns), range(len(group.columns))])
+ group = pd.DataFrame(group.values,
+ columns=group_col,
+ index=group.index)
+ desc_groups.append(group)
+ expected = pd.concat(desc_groups, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+ groupedT = tsframe.groupby({'A': 0, 'B': 0,
+ 'C': 1, 'D': 1}, axis=1)
+ result = groupedT.describe()
+ expected = tsframe.describe().T
+ expected.index = pd.MultiIndex(
+ levels=[[0, 1], expected.index],
+ labels=[[0, 0, 1, 1], range(len(expected.index))])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_frame_describe_tupleindex():
+
+ # GH 14848 - regression from 0.19.0 to 0.19.1
+ df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
+ 'y': [10, 20, 30, 40, 50] * 3,
+ 'z': [100, 200, 300, 400, 500] * 3})
+ df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
+ df2 = df1.rename(columns={'k': 'key'})
+ pytest.raises(ValueError, lambda: df1.groupby('k').describe())
+ pytest.raises(ValueError, lambda: df2.groupby('key').describe())
+
+
+def test_frame_describe_unstacked_format():
+ # GH 4792
+ prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
+ pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
+ pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
+ volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
+ pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
+ pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
+ df = pd.DataFrame({'PRICE': prices,
+ 'VOLUME': volumes})
+ result = df.groupby('PRICE').VOLUME.describe()
+ data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
+ df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
+ expected = pd.DataFrame(data,
+ index=pd.Index([24990, 25499], name='PRICE'),
+ columns=['count', 'mean', 'std', 'min',
+ '25%', '50%', '75%', 'max'])
+ tm.assert_frame_equal(result, expected)
+
+
+# nunique
+# --------------------------------
+
+@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6),
+ (10, 100, 1000)))
+@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2))
+def test_series_groupby_nunique(n, m, sort, dropna):
+
+ def check_nunique(df, keys, as_index=True):
+ gr = df.groupby(keys, as_index=as_index, sort=sort)
+ left = gr['julie'].nunique(dropna=dropna)
+
+ gr = df.groupby(keys, as_index=as_index, sort=sort)
+ right = gr['julie'].apply(Series.nunique, dropna=dropna)
+ if not as_index:
+ right = right.reset_index(drop=True)
+
+ tm.assert_series_equal(left, right, check_names=False)
+
+ days = date_range('2015-08-23', periods=10)
+
+ frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n),
+ 'joe': np.random.choice(days, n),
+ 'julie': np.random.randint(0, m, n)})
+
+ check_nunique(frame, ['jim'])
+ check_nunique(frame, ['jim', 'joe'])
+
+ frame.loc[1::17, 'jim'] = None
+ frame.loc[3::37, 'joe'] = None
+ frame.loc[7::19, 'julie'] = None
+ frame.loc[8::19, 'julie'] = None
+ frame.loc[9::19, 'julie'] = None
+
+ check_nunique(frame, ['jim'])
+ check_nunique(frame, ['jim', 'joe'])
+ check_nunique(frame, ['jim'], as_index=False)
+ check_nunique(frame, ['jim', 'joe'], as_index=False)
+
+
+def test_nunique():
+ df = DataFrame({
+ 'A': list('abbacc'),
+ 'B': list('abxacc'),
+ 'C': list('abbacx'),
+ })
+
+ expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
+ result = df.groupby('A', as_index=False).nunique()
+ tm.assert_frame_equal(result, expected)
+
+ # as_index
+ expected.index = list('abc')
+ expected.index.name = 'A'
+ result = df.groupby('A').nunique()
+ tm.assert_frame_equal(result, expected)
+
+ # with na
+ result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
+ tm.assert_frame_equal(result, expected)
+
+ # dropna
+ expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
+ index=list('abc'))
+ expected.index.name = 'A'
+ result = df.replace({'x': None}).groupby('A').nunique()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_nunique_with_object():
+ # GH 11077
+ data = pd.DataFrame(
+ [[100, 1, 'Alice'],
+ [200, 2, 'Bob'],
+ [300, 3, 'Charlie'],
+ [-400, 4, 'Dan'],
+ [500, 5, 'Edith']],
+ columns=['amount', 'id', 'name']
+ )
+
+ result = data.groupby(['id', 'amount'])['name'].nunique()
+ index = MultiIndex.from_arrays([data.id, data.amount])
+ expected = pd.Series([1] * 5, name='name', index=index)
+ tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_empty_series():
+ # GH 12553
+ data = pd.Series(name='name')
+ result = data.groupby(level=0).nunique()
+ expected = pd.Series(name='name', dtype='int64')
+ tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_timegrouper():
+ # GH 13453
+ test = pd.DataFrame({
+ 'time': [Timestamp('2016-06-28 09:35:35'),
+ Timestamp('2016-06-28 16:09:30'),
+ Timestamp('2016-06-28 16:46:28')],
+ 'data': ['1', '2', '3']}).set_index('time')
+ result = test.groupby(pd.Grouper(freq='h'))['data'].nunique()
+ expected = test.groupby(
+ pd.Grouper(freq='h')
+ )['data'].apply(pd.Series.nunique)
+ tm.assert_series_equal(result, expected)
+
+
+# count
+# --------------------------------
+
+def test_groupby_timedelta_cython_count():
+ df = DataFrame({'g': list('ab' * 2),
+ 'delt': np.arange(4).astype('timedelta64[ns]')})
+ expected = Series([
+ 2, 2
+ ], index=pd.Index(['a', 'b'], name='g'), name='delt')
+ result = df.groupby('g').delt.count()
+ tm.assert_series_equal(expected, result)
+
+
+def test_count():
+ n = 1 << 15
+ dr = date_range('2015-08-30', periods=n // 10, freq='T')
+
+ df = DataFrame({
+ '1st': np.random.choice(
+ list(ascii_lowercase), n),
+ '2nd': np.random.randint(0, 5, n),
+ '3rd': np.random.randn(n).round(3),
+ '4th': np.random.randint(-10, 10, n),
+ '5th': np.random.choice(dr, n),
+ '6th': np.random.randn(n).round(3),
+ '7th': np.random.randn(n).round(3),
+ '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
+ '9th': np.random.choice(
+ list(ascii_lowercase), n)
+ })
+
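+ # leave the key columns ('1st', '2nd') and '4th' fully observed and
+ # sprinkle ~10% NaN into the rest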
+ for col in df.columns.drop(['1st', '2nd', '4th']):
+ df.loc[np.random.choice(n, n // 10), col] = np.nan
+
+ df['9th'] = df['9th'].astype('category')
+
+ for key in '1st', '2nd', ['1st', '2nd']:
+ left = df.groupby(key).count()
+ right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
+ tm.assert_frame_equal(left, right)
+
+ # GH5610
+ # count counts non-nulls
+ df = pd.DataFrame([[1, 2, 'foo'],
+ [1, np.nan, 'bar'],
+ [3, np.nan, np.nan]],
+ columns=['A', 'B', 'C'])
+
+ count_as = df.groupby('A').count()
+ count_not_as = df.groupby('A', as_index=False).count()
+
+ expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
+ index=[1, 3])
+ expected.index.name = 'A'
+ tm.assert_frame_equal(count_not_as, expected.reset_index())
+ tm.assert_frame_equal(count_as, expected)
+
+ count_B = df.groupby('A')['B'].count()
+ tm.assert_series_equal(count_B, expected['B'])
+
+
+def test_count_object():
+ df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3})
+ result = df.groupby('c').a.count()
+ expected = pd.Series([
+ 3, 3
+ ], index=pd.Index([2, 3], name='c'), name='a')
+ tm.assert_series_equal(result, expected)
+
+ df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
+ 'c': [2] * 3 + [3] * 3})
+ result = df.groupby('c').a.count()
+ expected = pd.Series([
+ 1, 3
+ ], index=pd.Index([2, 3], name='c'), name='a')
+ tm.assert_series_equal(result, expected)
+
+
+def test_count_cross_type():
+ # GH8169
+ vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint(
+ 0, 2, (100, 2))))
+
+ df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
+ df[df == 2] = np.nan
+ expected = df.groupby(['c', 'd']).count()
+
+ for t in ['float32', 'object']:
+ df['a'] = df['a'].astype(t)
+ df['b'] = df['b'].astype(t)
+ result = df.groupby(['c', 'd']).count()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_lower_int_prec_count():
+ df = DataFrame({'a': np.array(
+ [0, 1, 2, 100], np.int8),
+ 'b': np.array(
+ [1, 2, 3, 6], np.uint32),
+ 'c': np.array(
+ [4, 5, 6, 8], np.int16),
+ 'grp': list('ab' * 2)})
+ result = df.groupby('grp').count()
+ expected = DataFrame({'a': [2, 2],
+ 'b': [2, 2],
+ 'c': [2, 2]}, index=pd.Index(list('ab'),
+ name='grp'))
+ tm.assert_frame_equal(result, expected)
+
+
+def test_count_uses_size_on_exception():
+ class RaisingObjectException(Exception):
+ pass
+
+ class RaisingObject(object):
+
+ def __init__(self, msg='I will raise inside Cython'):
+ super(RaisingObject, self).__init__()
+ self.msg = msg
+
+ def __eq__(self, other):
+ # called inside the Cython count path; raising here forces
+ # the fallback to size()
+ raise RaisingObjectException(self.msg)
+
+ df = DataFrame({'a': [RaisingObject() for _ in range(4)],
+ 'grp': list('ab' * 2)})
+ result = df.groupby('grp').count()
+ expected = DataFrame({'a': [2, 2]}, index=pd.Index(
+ list('ab'), name='grp'))
+ tm.assert_frame_equal(result, expected)
+
+
+# size
+# --------------------------------
+
+def test_size(df):
+ grouped = df.groupby(['A', 'B'])
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ grouped = df.groupby('A')
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ grouped = df.groupby('B')
+ result = grouped.size()
+ for key, group in grouped:
+ assert result[key] == len(group)
+
+ df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
+ for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
+ left = df.groupby(key, sort=sort).size()
+ right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
+ tm.assert_series_equal(left, right, check_names=False)
+
+ # GH11699
+ df = DataFrame([], columns=['A', 'B'])
+ out = Series([], dtype='int64', index=Index([], name='A'))
+ tm.assert_series_equal(df.groupby('A').size(), out)
+
+
+# pipe
+# --------------------------------
+
+def test_pipe():
+ # Test the pipe method of DataFrameGroupBy.
+ # Issue #17871
+
+ random_state = np.random.RandomState(1234567890)
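+ # a fixed seed keeps the hard-coded expected values reproducible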
+
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': random_state.randn(8),
+ 'C': random_state.randn(8)})
+
+ def f(dfgb):
+ return dfgb.B.max() - dfgb.C.min().min()
+
+ def square(srs):
+ return srs ** 2
+
+ # Note that the transformations are
+ # GroupBy -> Series
+ # Series -> Series
+ # This then chains the GroupBy.pipe and the
+ # NDFrame.pipe methods
+ result = df.groupby('A').pipe(f).pipe(square)
+
+ index = Index([u'bar', u'foo'], dtype='object', name=u'A')
+ expected = pd.Series([8.99110003361, 8.17516964785], name='B',
+ index=index)
+
+ tm.assert_series_equal(expected, result)
+
+
+def test_pipe_args():
+ # Test passing args to the pipe method of DataFrameGroupBy.
+ # Issue #17871
+
+ df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
+ 'x': [1.0, 2.0, 3.0, 2.0, 5.0],
+ 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
+
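+ # f filters the groups and re-groups on the same grouper, so the
+ # next pipe stage still receives a GroupBy object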
+ def f(dfgb, arg1):
+ return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
+ .groupby(dfgb.grouper))
+
+ def g(dfgb, arg2):
+ return dfgb.sum() / dfgb.sum().sum() + arg2
+
+ def h(df, arg3):
+ return df.x + df.y - arg3
+
+ result = (df
+ .groupby('group')
+ .pipe(f, 0)
+ .pipe(g, 10)
+ .pipe(h, 100))
+
+ # Assert the results here
+ index = pd.Index(['A', 'B', 'C'], name='group')
+ expected = pd.Series([-79.5160891089, -78.4839108911, -80],
+ index=index)
+
+ tm.assert_series_equal(expected, result)
+
+ # test SeriesGroupby.pipe
+ ser = pd.Series([1, 1, 2, 2, 3, 3])
+ result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
+
+ expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
+
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py
deleted file mode 100644
index b9718663570bd..0000000000000
--- a/pandas/tests/groupby/test_functional.py
+++ /dev/null
@@ -1,372 +0,0 @@
-# -*- coding: utf-8 -*-
-
-""" test function application """
-
-import pytest
-
-from string import ascii_lowercase
-from pandas import (date_range, Timestamp,
- Index, MultiIndex, DataFrame, Series)
-from pandas.util.testing import assert_frame_equal, assert_series_equal
-from pandas.compat import product as cart_product
-
-import numpy as np
-
-import pandas.util.testing as tm
-import pandas as pd
-from .common import MixIn
-
-
-# describe
-# --------------------------------
-
-class TestDescribe(MixIn):
-
- def test_apply_describe_bug(self):
- grouped = self.mframe.groupby(level='first')
- grouped.describe() # it works!
-
- def test_series_describe_multikey(self):
- ts = tm.makeTimeSeries()
- grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
- result = grouped.describe()
- assert_series_equal(result['mean'], grouped.mean(), check_names=False)
- assert_series_equal(result['std'], grouped.std(), check_names=False)
- assert_series_equal(result['min'], grouped.min(), check_names=False)
-
- def test_series_describe_single(self):
- ts = tm.makeTimeSeries()
- grouped = ts.groupby(lambda x: x.month)
- result = grouped.apply(lambda x: x.describe())
- expected = grouped.describe().stack()
- assert_series_equal(result, expected)
-
- def test_series_index_name(self):
- grouped = self.df.loc[:, ['C']].groupby(self.df['A'])
- result = grouped.agg(lambda x: x.mean())
- assert result.index.name == 'A'
-
- def test_frame_describe_multikey(self):
- grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
- result = grouped.describe()
- desc_groups = []
- for col in self.tsframe:
- group = grouped[col].describe()
- # GH 17464 - Remove duplicate MultiIndex levels
- group_col = pd.MultiIndex(
- levels=[[col], group.columns],
- labels=[[0] * len(group.columns), range(len(group.columns))])
- group = pd.DataFrame(group.values,
- columns=group_col,
- index=group.index)
- desc_groups.append(group)
- expected = pd.concat(desc_groups, axis=1)
- tm.assert_frame_equal(result, expected)
-
- groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
- 'C': 1, 'D': 1}, axis=1)
- result = groupedT.describe()
- expected = self.tsframe.describe().T
- expected.index = pd.MultiIndex(
- levels=[[0, 1], expected.index],
- labels=[[0, 0, 1, 1], range(len(expected.index))])
- tm.assert_frame_equal(result, expected)
-
- def test_frame_describe_tupleindex(self):
-
- # GH 14848 - regression from 0.19.0 to 0.19.1
- df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
- 'y': [10, 20, 30, 40, 50] * 3,
- 'z': [100, 200, 300, 400, 500] * 3})
- df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
- df2 = df1.rename(columns={'k': 'key'})
- pytest.raises(ValueError, lambda: df1.groupby('k').describe())
- pytest.raises(ValueError, lambda: df2.groupby('key').describe())
-
- def test_frame_describe_unstacked_format(self):
- # GH 4792
- prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
- pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
- pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
- volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
- pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
- pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
- df = pd.DataFrame({'PRICE': prices,
- 'VOLUME': volumes})
- result = df.groupby('PRICE').VOLUME.describe()
- data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
- df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
- expected = pd.DataFrame(data,
- index=pd.Index([24990, 25499], name='PRICE'),
- columns=['count', 'mean', 'std', 'min',
- '25%', '50%', '75%', 'max'])
- tm.assert_frame_equal(result, expected)
-
-
-# nunique
-# --------------------------------
-
-class TestNUnique(MixIn):
-
- def test_series_groupby_nunique(self):
-
- def check_nunique(df, keys, as_index=True):
- for sort, dropna in cart_product((False, True), repeat=2):
- gr = df.groupby(keys, as_index=as_index, sort=sort)
- left = gr['julie'].nunique(dropna=dropna)
-
- gr = df.groupby(keys, as_index=as_index, sort=sort)
- right = gr['julie'].apply(Series.nunique, dropna=dropna)
- if not as_index:
- right = right.reset_index(drop=True)
-
- assert_series_equal(left, right, check_names=False)
-
- days = date_range('2015-08-23', periods=10)
-
- for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
- frame = DataFrame({
- 'jim': np.random.choice(
- list(ascii_lowercase), n),
- 'joe': np.random.choice(days, n),
- 'julie': np.random.randint(0, m, n)
- })
-
- check_nunique(frame, ['jim'])
- check_nunique(frame, ['jim', 'joe'])
-
- frame.loc[1::17, 'jim'] = None
- frame.loc[3::37, 'joe'] = None
- frame.loc[7::19, 'julie'] = None
- frame.loc[8::19, 'julie'] = None
- frame.loc[9::19, 'julie'] = None
-
- check_nunique(frame, ['jim'])
- check_nunique(frame, ['jim', 'joe'])
- check_nunique(frame, ['jim'], as_index=False)
- check_nunique(frame, ['jim', 'joe'], as_index=False)
-
- def test_nunique(self):
- df = DataFrame({
- 'A': list('abbacc'),
- 'B': list('abxacc'),
- 'C': list('abbacx'),
- })
-
- expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
- result = df.groupby('A', as_index=False).nunique()
- tm.assert_frame_equal(result, expected)
-
- # as_index
- expected.index = list('abc')
- expected.index.name = 'A'
- result = df.groupby('A').nunique()
- tm.assert_frame_equal(result, expected)
-
- # with na
- result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
- tm.assert_frame_equal(result, expected)
-
- # dropna
- expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
- index=list('abc'))
- expected.index.name = 'A'
- result = df.replace({'x': None}).groupby('A').nunique()
- tm.assert_frame_equal(result, expected)
-
- def test_nunique_with_object(self):
- # GH 11077
- data = pd.DataFrame(
- [[100, 1, 'Alice'],
- [200, 2, 'Bob'],
- [300, 3, 'Charlie'],
- [-400, 4, 'Dan'],
- [500, 5, 'Edith']],
- columns=['amount', 'id', 'name']
- )
-
- result = data.groupby(['id', 'amount'])['name'].nunique()
- index = MultiIndex.from_arrays([data.id, data.amount])
- expected = pd.Series([1] * 5, name='name', index=index)
- tm.assert_series_equal(result, expected)
-
- def test_nunique_with_empty_series(self):
- # GH 12553
- data = pd.Series(name='name')
- result = data.groupby(level=0).nunique()
- expected = pd.Series(name='name', dtype='int64')
- tm.assert_series_equal(result, expected)
-
- def test_nunique_with_timegrouper(self):
- # GH 13453
- test = pd.DataFrame({
- 'time': [Timestamp('2016-06-28 09:35:35'),
- Timestamp('2016-06-28 16:09:30'),
- Timestamp('2016-06-28 16:46:28')],
- 'data': ['1', '2', '3']}).set_index('time')
- result = test.groupby(pd.Grouper(freq='h'))['data'].nunique()
- expected = test.groupby(
- pd.Grouper(freq='h')
- )['data'].apply(pd.Series.nunique)
- tm.assert_series_equal(result, expected)
-
-
-# count
-# --------------------------------
-
-class TestCount(MixIn):
-
- def test_groupby_timedelta_cython_count(self):
- df = DataFrame({'g': list('ab' * 2),
- 'delt': np.arange(4).astype('timedelta64[ns]')})
- expected = Series([
- 2, 2
- ], index=pd.Index(['a', 'b'], name='g'), name='delt')
- result = df.groupby('g').delt.count()
- tm.assert_series_equal(expected, result)
-
- def test_count(self):
- n = 1 << 15
- dr = date_range('2015-08-30', periods=n // 10, freq='T')
-
- df = DataFrame({
- '1st': np.random.choice(
- list(ascii_lowercase), n),
- '2nd': np.random.randint(0, 5, n),
- '3rd': np.random.randn(n).round(3),
- '4th': np.random.randint(-10, 10, n),
- '5th': np.random.choice(dr, n),
- '6th': np.random.randn(n).round(3),
- '7th': np.random.randn(n).round(3),
- '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
- '9th': np.random.choice(
- list(ascii_lowercase), n)
- })
-
- for col in df.columns.drop(['1st', '2nd', '4th']):
- df.loc[np.random.choice(n, n // 10), col] = np.nan
-
- df['9th'] = df['9th'].astype('category')
-
- for key in '1st', '2nd', ['1st', '2nd']:
- left = df.groupby(key).count()
- right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
- assert_frame_equal(left, right)
-
- # GH5610
- # count counts non-nulls
- df = pd.DataFrame([[1, 2, 'foo'],
- [1, np.nan, 'bar'],
- [3, np.nan, np.nan]],
- columns=['A', 'B', 'C'])
-
- count_as = df.groupby('A').count()
- count_not_as = df.groupby('A', as_index=False).count()
-
- expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
- index=[1, 3])
- expected.index.name = 'A'
- assert_frame_equal(count_not_as, expected.reset_index())
- assert_frame_equal(count_as, expected)
-
- count_B = df.groupby('A')['B'].count()
- assert_series_equal(count_B, expected['B'])
-
- def test_count_object(self):
- df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3})
- result = df.groupby('c').a.count()
- expected = pd.Series([
- 3, 3
- ], index=pd.Index([2, 3], name='c'), name='a')
- tm.assert_series_equal(result, expected)
-
- df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
- 'c': [2] * 3 + [3] * 3})
- result = df.groupby('c').a.count()
- expected = pd.Series([
- 1, 3
- ], index=pd.Index([2, 3], name='c'), name='a')
- tm.assert_series_equal(result, expected)
-
- def test_count_cross_type(self): # GH8169
- vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint(
- 0, 2, (100, 2))))
-
- df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
- df[df == 2] = np.nan
- expected = df.groupby(['c', 'd']).count()
-
- for t in ['float32', 'object']:
- df['a'] = df['a'].astype(t)
- df['b'] = df['b'].astype(t)
- result = df.groupby(['c', 'd']).count()
- tm.assert_frame_equal(result, expected)
-
- def test_lower_int_prec_count(self):
- df = DataFrame({'a': np.array(
- [0, 1, 2, 100], np.int8),
- 'b': np.array(
- [1, 2, 3, 6], np.uint32),
- 'c': np.array(
- [4, 5, 6, 8], np.int16),
- 'grp': list('ab' * 2)})
- result = df.groupby('grp').count()
- expected = DataFrame({'a': [2, 2],
- 'b': [2, 2],
- 'c': [2, 2]}, index=pd.Index(list('ab'),
- name='grp'))
- tm.assert_frame_equal(result, expected)
-
- def test_count_uses_size_on_exception(self):
- class RaisingObjectException(Exception):
- pass
-
- class RaisingObject(object):
-
- def __init__(self, msg='I will raise inside Cython'):
- super(RaisingObject, self).__init__()
- self.msg = msg
-
- def __eq__(self, other):
- # gets called in Cython to check that raising calls the method
- raise RaisingObjectException(self.msg)
-
- df = DataFrame({'a': [RaisingObject() for _ in range(4)],
- 'grp': list('ab' * 2)})
- result = df.groupby('grp').count()
- expected = DataFrame({'a': [2, 2]}, index=pd.Index(
- list('ab'), name='grp'))
- tm.assert_frame_equal(result, expected)
-
-
-# size
-# --------------------------------
-
-class TestSize(MixIn):
-
- def test_size(self):
- grouped = self.df.groupby(['A', 'B'])
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
-
- grouped = self.df.groupby('A')
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
-
- grouped = self.df.groupby('B')
- result = grouped.size()
- for key, group in grouped:
- assert result[key] == len(group)
-
- df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
- for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
- left = df.groupby(key, sort=sort).size()
- right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
- assert_series_equal(left, right, check_names=False)
-
- # GH11699
- df = DataFrame([], columns=['A', 'B'])
- out = Series([], dtype='int64', index=Index([], name='A'))
- assert_series_equal(df.groupby('A').size(), out)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index cdb4e3072c65d..bb892f92f213e 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5,3041 +5,1672 @@
from warnings import catch_warnings
from datetime import datetime
+from decimal import Decimal
-from pandas import (date_range, bdate_range, Timestamp,
+from pandas import (date_range, Timestamp,
Index, MultiIndex, DataFrame, Series,
- concat, Panel, DatetimeIndex, read_csv)
-from pandas.core.dtypes.missing import isna
-from pandas.errors import UnsupportedFunctionCall, PerformanceWarning
-from pandas.util.testing import (assert_frame_equal, assert_index_equal,
+ Panel, DatetimeIndex, read_csv)
+from pandas.errors import PerformanceWarning
+from pandas.util.testing import (assert_frame_equal,
assert_series_equal, assert_almost_equal)
from pandas.compat import (range, lrange, StringIO, lmap, lzip, map, zip,
- builtins, OrderedDict)
+ OrderedDict)
from pandas import compat
from collections import defaultdict
import pandas.core.common as com
import numpy as np
-import pandas.core.nanops as nanops
import pandas.util.testing as tm
import pandas as pd
-from .common import MixIn
-class TestGrouper(object):
+def test_repr():
+ # GH18203
+ result = repr(pd.Grouper(key='A', level='B'))
+ expected = "Grouper(key='A', level='B', axis=0, sort=False)"
+ assert result == expected
- def test_repr(self):
- # GH18203
- result = repr(pd.Grouper(key='A', level='B'))
- expected = "Grouper(key='A', level='B', axis=0, sort=False)"
- assert result == expected
+@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
+def test_basic(dtype):
-class TestGroupBy(MixIn):
+ data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
- def test_basic(self):
- def checkit(dtype):
- data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
+ index = np.arange(9)
+ np.random.shuffle(index)
+ data = data.reindex(index)
- index = np.arange(9)
- np.random.shuffle(index)
- data = data.reindex(index)
+ grouped = data.groupby(lambda x: x // 3)
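+ # np.arange(9) // 3 yields three groups (0, 1, 2) of three elements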
- grouped = data.groupby(lambda x: x // 3)
+ for k, v in grouped:
+ assert len(v) == 3
- for k, v in grouped:
- assert len(v) == 3
+ agged = grouped.aggregate(np.mean)
+ assert agged[1] == 1
- agged = grouped.aggregate(np.mean)
- assert agged[1] == 1
+ assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
+ assert_series_equal(agged, grouped.mean())
+ assert_series_equal(grouped.agg(np.sum), grouped.sum())
- assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
- assert_series_equal(agged, grouped.mean())
- assert_series_equal(grouped.agg(np.sum), grouped.sum())
+ expected = grouped.apply(lambda x: x * x.sum())
+ transformed = grouped.transform(lambda x: x * x.sum())
+ assert transformed[7] == 12
+ assert_series_equal(transformed, expected)
- expected = grouped.apply(lambda x: x * x.sum())
- transformed = grouped.transform(lambda x: x * x.sum())
- assert transformed[7] == 12
- assert_series_equal(transformed, expected)
+ value_grouped = data.groupby(data)
+ assert_series_equal(value_grouped.aggregate(np.mean), agged,
+ check_index_type=False)
- value_grouped = data.groupby(data)
- assert_series_equal(value_grouped.aggregate(np.mean), agged,
- check_index_type=False)
+ # complex agg
+ agged = grouped.aggregate([np.mean, np.std])
- # complex agg
- agged = grouped.aggregate([np.mean, np.std])
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ agged = grouped.aggregate({'one': np.mean, 'two': np.std})
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- agged = grouped.aggregate({'one': np.mean, 'two': np.std})
+ group_constants = {0: 10, 1: 20, 2: 30}
+ agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
+ assert agged[1] == 21
- group_constants = {0: 10, 1: 20, 2: 30}
- agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
- assert agged[1] == 21
+ # corner cases
+ pytest.raises(Exception, grouped.aggregate, lambda x: x * 2)
- # corner cases
- pytest.raises(Exception, grouped.aggregate, lambda x: x * 2)
- for dtype in ['int64', 'int32', 'float64', 'float32']:
- checkit(dtype)
+def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
+ key = mframe.index.labels[0]
+ grouped = mframe.groupby(key)
+ result = grouped.sum()
- def test_groupby_nonobject_dtype(self):
- key = self.mframe.index.labels[0]
- grouped = self.mframe.groupby(key)
- result = grouped.sum()
+ expected = mframe.groupby(key.astype('O')).sum()
+ assert_frame_equal(result, expected)
- expected = self.mframe.groupby(key.astype('O')).sum()
- assert_frame_equal(result, expected)
+ # GH 3911, mixed frame non-conversion
+ df = df_mixed_floats.copy()
+ df['value'] = lrange(len(df))
- # GH 3911, mixed frame non-conversion
- df = self.df_mixed_floats.copy()
- df['value'] = lrange(len(df))
+ def max_value(group):
+ return group.loc[group['value'].idxmax()]
- def max_value(group):
- return group.loc[group['value'].idxmax()]
+ applied = df.groupby('A').apply(max_value)
+ result = applied.get_dtype_counts().sort_values()
+ expected = Series({'float64': 2,
+ 'int64': 1,
+ 'object': 2}).sort_values()
+ assert_series_equal(result, expected)
- applied = df.groupby('A').apply(max_value)
- result = applied.get_dtype_counts().sort_values()
- expected = Series({'float64': 2,
- 'int64': 1,
- 'object': 2}).sort_values()
- assert_series_equal(result, expected)
- def test_groupby_return_type(self):
+def test_groupby_return_type():
- # GH2893, return a reduced type
- df1 = DataFrame(
- [{"val1": 1, "val2": 20},
- {"val1": 1, "val2": 19},
- {"val1": 2, "val2": 27},
- {"val1": 2, "val2": 12}
- ])
+ # GH2893, return a reduced type
+ df1 = DataFrame(
+ [{"val1": 1, "val2": 20},
+ {"val1": 1, "val2": 19},
+ {"val1": 2, "val2": 27},
+ {"val1": 2, "val2": 12}
+ ])
- def func(dataf):
- return dataf["val2"] - dataf["val2"].mean()
+ def func(dataf):
+ return dataf["val2"] - dataf["val2"].mean()
- result = df1.groupby("val1", squeeze=True).apply(func)
- assert isinstance(result, Series)
+ result = df1.groupby("val1", squeeze=True).apply(func)
+ assert isinstance(result, Series)
- df2 = DataFrame(
- [{"val1": 1, "val2": 20},
- {"val1": 1, "val2": 19},
- {"val1": 1, "val2": 27},
- {"val1": 1, "val2": 12}
- ])
+ df2 = DataFrame(
+ [{"val1": 1, "val2": 20},
+ {"val1": 1, "val2": 19},
+ {"val1": 1, "val2": 27},
+ {"val1": 1, "val2": 12}
+ ])
- def func(dataf):
- return dataf["val2"] - dataf["val2"].mean()
+ def func(dataf):
+ return dataf["val2"] - dataf["val2"].mean()
+
+ result = df2.groupby("val1", squeeze=True).apply(func)
+ assert isinstance(result, Series)
- result = df2.groupby("val1", squeeze=True).apply(func)
- assert isinstance(result, Series)
+ # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
+ df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
+ result = df.groupby('X', squeeze=False).count()
+ assert isinstance(result, DataFrame)
+
+ # GH5592
+ # inconsistent return type
+ df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
+ 'Pony', 'Pony'], B=Series(
+ np.arange(7), dtype='int64'), C=date_range(
+ '20130101', periods=7)))
+
+ def f(grp):
+ return grp.iloc[0]
+
+ expected = df.groupby('A').first()[['B']]
+ result = df.groupby('A').apply(f)[['B']]
+ assert_frame_equal(result, expected)
+
+ def f(grp):
+ if grp.name == 'Tiger':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['B']]
+ e = expected.copy()
+ e.loc['Tiger'] = np.nan
+ assert_frame_equal(result, e)
+
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['B']]
+ e = expected.copy()
+ e.loc['Pony'] = np.nan
+ assert_frame_equal(result, e)
+
+ # 5592 revisited, with datetimes
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0]
+
+ result = df.groupby('A').apply(f)[['C']]
+ e = df.groupby('A').first()[['C']]
+ e.loc['Pony'] = pd.NaT
+ assert_frame_equal(result, e)
+
+ # scalar outputs
+ def f(grp):
+ if grp.name == 'Pony':
+ return None
+ return grp.iloc[0].loc['C']
+
+ result = df.groupby('A').apply(f)
+ e = df.groupby('A').first()['C'].copy()
+ e.loc['Pony'] = np.nan
+ e.name = None
+ assert_series_equal(result, e)
- # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
- df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
- result = df.groupby('X', squeeze=False).count()
- assert isinstance(result, DataFrame)
- # GH5592
- # inconcistent return type
- df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
- 'Pony', 'Pony'], B=Series(
- np.arange(7), dtype='int64'), C=date_range(
- '20130101', periods=7)))
+def test_pass_args_kwargs(ts, tsframe):
- def f(grp):
- return grp.iloc[0]
+ def f(x, q=None, axis=0):
+ return np.percentile(x, q, axis=axis)
- expected = df.groupby('A').first()[['B']]
- result = df.groupby('A').apply(f)[['B']]
- assert_frame_equal(result, expected)
+ g = lambda x: np.percentile(x, 80, axis=0)
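+ # agg, apply and transform should all forward extra positional and
+ # keyword arguments to the passed function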
- def f(grp):
- if grp.name == 'Tiger':
- return None
- return grp.iloc[0]
+ # Series
+ ts_grouped = ts.groupby(lambda x: x.month)
+ agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
+ apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
+ trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
+
+ agg_expected = ts_grouped.quantile(.8)
+ trans_expected = ts_grouped.transform(g)
+
+ assert_series_equal(apply_result, agg_expected)
+ assert_series_equal(agg_result, agg_expected, check_names=False)
+ assert_series_equal(trans_result, trans_expected)
+
+ agg_result = ts_grouped.agg(f, q=80)
+ apply_result = ts_grouped.apply(f, q=80)
+ trans_result = ts_grouped.transform(f, q=80)
+ assert_series_equal(agg_result, agg_expected)
+ assert_series_equal(apply_result, agg_expected)
+ assert_series_equal(trans_result, trans_expected)
+
+ # DataFrame
+ df_grouped = tsframe.groupby(lambda x: x.month)
+ agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+ apply_result = df_grouped.apply(DataFrame.quantile, .8)
+ expected = df_grouped.quantile(.8)
+ assert_frame_equal(apply_result, expected)
+ assert_frame_equal(agg_result, expected, check_names=False)
+
+ agg_result = df_grouped.agg(f, q=80)
+ apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
+ assert_frame_equal(agg_result, expected, check_names=False)
+ assert_frame_equal(apply_result, expected)
+
+
+def test_len():
+ df = tm.makeTimeDataFrame()
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day])
+ assert len(grouped) == len(df)
- result = df.groupby('A').apply(f)[['B']]
- e = expected.copy()
- e.loc['Tiger'] = np.nan
- assert_frame_equal(result, e)
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month])
+ expected = len({(x.year, x.month) for x in df.index})
+ assert len(grouped) == expected
- def f(grp):
- if grp.name == 'Pony':
- return None
- return grp.iloc[0]
+ # issue 11016
+ df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
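+ # NaN group keys are dropped, so grouping on 'a' yields no groups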
+ assert len(df.groupby(('a'))) == 0
+ assert len(df.groupby(('b'))) == 3
+ assert len(df.groupby(['a', 'b'])) == 3
+
+
+def test_basic_regression():
+ # regression
+ T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
+ result = Series(T, lrange(0, len(T)))
- result = df.groupby('A').apply(f)[['B']]
- e = expected.copy()
- e.loc['Pony'] = np.nan
- assert_frame_equal(result, e)
+ groupings = np.random.random((1100, ))
+ groupings = Series(groupings, lrange(0, len(groupings))) * 10.
- # 5592 revisited, with datetimes
- def f(grp):
- if grp.name == 'Pony':
- return None
- return grp.iloc[0]
+ grouped = result.groupby(groupings)
+ grouped.mean()
- result = df.groupby('A').apply(f)[['C']]
- e = df.groupby('A').first()[['C']]
- e.loc['Pony'] = pd.NaT
- assert_frame_equal(result, e)
- # scalar outputs
- def f(grp):
- if grp.name == 'Pony':
- return None
- return grp.iloc[0].loc['C']
-
- result = df.groupby('A').apply(f)
- e = df.groupby('A').first()['C'].copy()
- e.loc['Pony'] = np.nan
- e.name = None
- assert_series_equal(result, e)
-
- def test_apply_issues(self):
- # GH 5788
-
- s = """2011.05.16,00:00,1.40893
-2011.05.16,01:00,1.40760
-2011.05.16,02:00,1.40750
-2011.05.16,03:00,1.40649
-2011.05.17,02:00,1.40893
-2011.05.17,03:00,1.40760
-2011.05.17,04:00,1.40750
-2011.05.17,05:00,1.40649
-2011.05.18,02:00,1.40893
-2011.05.18,03:00,1.40760
-2011.05.18,04:00,1.40750
-2011.05.18,05:00,1.40649"""
-
- df = pd.read_csv(
- StringIO(s), header=None, names=['date', 'time', 'value'],
- parse_dates=[['date', 'time']])
- df = df.set_index('date_time')
-
- expected = df.groupby(df.index.date).idxmax()
- result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
- assert_frame_equal(result, expected)
-
- # GH 5789
- # don't auto coerce dates
- df = pd.read_csv(
- StringIO(s), header=None, names=['date', 'time', 'value'])
- exp_idx = pd.Index(
- ['2011.05.16', '2011.05.17', '2011.05.18'
- ], dtype=object, name='date')
- expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
- result = df.groupby('date').apply(
- lambda x: x['time'][x['value'].idxmax()])
- assert_series_equal(result, expected)
-
- def test_apply_trivial(self):
- # GH 20066
- # trivial apply: ignore input and return a constant dataframe.
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
- 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
- columns=['key', 'data'])
- expected = pd.concat([df.iloc[1:], df.iloc[1:]],
- axis=1, keys=['float64', 'object'])
- result = df.groupby([str(x) for x in df.dtypes],
- axis=1).apply(lambda x: df.iloc[1:])
-
- assert_frame_equal(result, expected)
-
- @pytest.mark.xfail(reason=("GH 20066; function passed into apply "
- "returns a DataFrame with the same index "
- "as the one to create GroupBy object."))
- def test_apply_trivial_fail(self):
- # GH 20066
- # trivial apply fails if the constant dataframe has the same index
- # with the one used to create GroupBy object.
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
- 'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
- columns=['key', 'data'])
- expected = pd.concat([df, df],
- axis=1, keys=['float64', 'object'])
- result = df.groupby([str(x) for x in df.dtypes],
- axis=1).apply(lambda x: df)
-
- assert_frame_equal(result, expected)
-
- def test_time_field_bug(self):
- # Test a fix for the following error related to GH issue 11324 When
- # non-key fields in a group-by dataframe contained time-based fields
- # that were not returned by the apply function, an exception would be
- # raised.
-
- df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
-
- def func_with_no_date(batch):
- return pd.Series({'c': 2})
-
- def func_with_date(batch):
- return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})
-
- dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
- dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
- dfg_no_conversion_expected.index.name = 'a'
-
- dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
- dfg_conversion_expected = pd.DataFrame(
- {'b': datetime(2015, 1, 1),
- 'c': 2}, index=[1])
- dfg_conversion_expected.index.name = 'a'
-
- tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
- tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
-
- def test_len(self):
- df = tm.makeTimeDataFrame()
- grouped = df.groupby([lambda x: x.year, lambda x: x.month,
- lambda x: x.day])
- assert len(grouped) == len(df)
-
- grouped = df.groupby([lambda x: x.year, lambda x: x.month])
- expected = len({(x.year, x.month) for x in df.index})
- assert len(grouped) == expected
-
- # issue 11016
- df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
- assert len(df.groupby(('a'))) == 0
- assert len(df.groupby(('b'))) == 3
- assert len(df.groupby(['a', 'b'])) == 3
-
- def test_basic_regression(self):
- # regression
- T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
- result = Series(T, lrange(0, len(T)))
-
- groupings = np.random.random((1100, ))
- groupings = Series(groupings, lrange(0, len(groupings))) * 10.
-
- grouped = result.groupby(groupings)
- grouped.mean()
-
- def test_with_na_groups(self):
- index = Index(np.arange(10))
-
- for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']:
- values = Series(np.ones(10), index, dtype=dtype)
- labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
- 'bar', 'bar', np.nan, 'foo'], index=index)
-
- # this SHOULD be an int
- grouped = values.groupby(labels)
- agged = grouped.agg(len)
- expected = Series([4, 2], index=['bar', 'foo'])
-
- assert_series_equal(agged, expected, check_dtype=False)
-
- # assert issubclass(agged.dtype.type, np.integer)
-
- # explicitly return a float from my function
- def f(x):
- return float(len(x))
-
- agged = grouped.agg(f)
- expected = Series([4, 2], index=['bar', 'foo'])
-
- assert_series_equal(agged, expected, check_dtype=False)
- assert issubclass(agged.dtype.type, np.dtype(dtype).type)
-
- def test_indices_concatenation_order(self):
-
- # GH 2808
-
- def f1(x):
- y = x[(x.b % 2) == 1] ** 2
- if y.empty:
- multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
- names=['b', 'c'])
- res = DataFrame(None, columns=['a'], index=multiindex)
- return res
- else:
- y = y.set_index(['b', 'c'])
- return y
-
- def f2(x):
- y = x[(x.b % 2) == 1] ** 2
- if y.empty:
- return DataFrame()
- else:
- y = y.set_index(['b', 'c'])
- return y
-
- def f3(x):
- y = x[(x.b % 2) == 1] ** 2
- if y.empty:
- multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
- names=['foo', 'bar'])
- res = DataFrame(None, columns=['a', 'b'], index=multiindex)
- return res
- else:
- return y
-
- df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
-
- df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
-
- # correct result
- result1 = df.groupby('a').apply(f1)
- result2 = df2.groupby('a').apply(f1)
- assert_frame_equal(result1, result2)
-
- # should fail (not the same number of levels)
- pytest.raises(AssertionError, df.groupby('a').apply, f2)
- pytest.raises(AssertionError, df2.groupby('a').apply, f2)
-
- # should fail (incorrect shape)
- pytest.raises(AssertionError, df.groupby('a').apply, f3)
- pytest.raises(AssertionError, df2.groupby('a').apply, f3)
-
- def test_attr_wrapper(self):
- grouped = self.ts.groupby(lambda x: x.weekday())
-
- result = grouped.std()
- expected = grouped.agg(lambda x: np.std(x, ddof=1))
- assert_series_equal(result, expected)
-
- # this is pretty cool
- result = grouped.describe()
- expected = {}
- for name, gp in grouped:
- expected[name] = gp.describe()
- expected = DataFrame(expected).T
- assert_frame_equal(result, expected)
-
- # get attribute
- result = grouped.dtype
- expected = grouped.agg(lambda x: x.dtype)
-
- # make sure raises error
- pytest.raises(AttributeError, getattr, grouped, 'foo')
-
- def test_frame_groupby(self):
- grouped = self.tsframe.groupby(lambda x: x.weekday())
-
- # aggregate
- aggregated = grouped.aggregate(np.mean)
- assert len(aggregated) == 5
- assert len(aggregated.columns) == 4
-
- # by string
- tscopy = self.tsframe.copy()
- tscopy['weekday'] = [x.weekday() for x in tscopy.index]
- stragged = tscopy.groupby('weekday').aggregate(np.mean)
- assert_frame_equal(stragged, aggregated, check_names=False)
-
- # transform
- grouped = self.tsframe.head(30).groupby(lambda x: x.weekday())
- transformed = grouped.transform(lambda x: x - x.mean())
- assert len(transformed) == 30
- assert len(transformed.columns) == 4
-
- # transform propagate
- transformed = grouped.transform(lambda x: x.mean())
- for name, group in grouped:
- mean = group.mean()
- for idx in group.index:
- tm.assert_series_equal(transformed.xs(idx), mean,
- check_names=False)
-
- # iterate
- for weekday, group in grouped:
- assert group.index[0].weekday() == weekday
-
- # groups / group_indices
- groups = grouped.groups
- indices = grouped.indices
-
- for k, v in compat.iteritems(groups):
- samething = self.tsframe.index.take(indices[k])
- assert (samething == v).all()
-
- def test_frame_groupby_columns(self):
- mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
- grouped = self.tsframe.groupby(mapping, axis=1)
-
- # aggregate
- aggregated = grouped.aggregate(np.mean)
- assert len(aggregated) == len(self.tsframe)
- assert len(aggregated.columns) == 2
-
- # transform
- tf = lambda x: x - x.mean()
- groupedT = self.tsframe.T.groupby(mapping, axis=0)
- assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
-
- # iterate
- for k, v in grouped:
- assert len(v.columns) == 2
-
- def test_frame_set_name_single(self):
- grouped = self.df.groupby('A')
-
- result = grouped.mean()
- assert result.index.name == 'A'
-
- result = self.df.groupby('A', as_index=False).mean()
- assert result.index.name != 'A'
-
- result = grouped.agg(np.mean)
- assert result.index.name == 'A'
-
- result = grouped.agg({'C': np.mean, 'D': np.std})
- assert result.index.name == 'A'
-
- result = grouped['C'].mean()
- assert result.index.name == 'A'
- result = grouped['C'].agg(np.mean)
- assert result.index.name == 'A'
- result = grouped['C'].agg([np.mean, np.std])
- assert result.index.name == 'A'
-
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
- assert result.index.name == 'A'
-
- def test_multi_func(self):
- col1 = self.df['A']
- col2 = self.df['B']
-
- grouped = self.df.groupby([col1.get, col2.get])
- agged = grouped.mean()
- expected = self.df.groupby(['A', 'B']).mean()
-
- # TODO groupby get drops names
- assert_frame_equal(agged.loc[:, ['C', 'D']],
- expected.loc[:, ['C', 'D']],
- check_names=False)
-
- # some "groups" with no data
- df = DataFrame({'v1': np.random.randn(6),
- 'v2': np.random.randn(6),
- 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
- 'k2': np.array(['1', '1', '1', '2', '2', '2'])},
- index=['one', 'two', 'three', 'four', 'five', 'six'])
- # only verify that it works for now
- grouped = df.groupby(['k1', 'k2'])
- grouped.agg(np.sum)
-
- def test_multi_key_multiple_functions(self):
- grouped = self.df.groupby(['A', 'B'])['C']
-
- agged = grouped.agg([np.mean, np.std])
- expected = DataFrame({'mean': grouped.agg(np.mean),
- 'std': grouped.agg(np.std)})
- assert_frame_equal(agged, expected)
-
- def test_frame_multi_key_function_list(self):
- data = DataFrame(
- {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
- 'foo', 'foo', 'foo'],
- 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
- 'two', 'two', 'one'],
- 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
- 'dull', 'shiny', 'shiny', 'shiny'],
- 'D': np.random.randn(11),
- 'E': np.random.randn(11),
- 'F': np.random.randn(11)})
-
- grouped = data.groupby(['A', 'B'])
- funcs = [np.mean, np.std]
- agged = grouped.agg(funcs)
- expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
- grouped['F'].agg(funcs)],
- keys=['D', 'E', 'F'], axis=1)
- assert (isinstance(agged.index, MultiIndex))
- assert (isinstance(expected.index, MultiIndex))
- assert_frame_equal(agged, expected)
-
- def test_groupby_multiple_columns(self):
- data = self.df
- grouped = data.groupby(['A', 'B'])
-
- def _check_op(op):
-
- with catch_warnings(record=True):
- result1 = op(grouped)
-
- expected = defaultdict(dict)
- for n1, gp1 in data.groupby('A'):
- for n2, gp2 in gp1.groupby('B'):
- expected[n1][n2] = op(gp2.loc[:, ['C', 'D']])
- expected = dict((k, DataFrame(v))
- for k, v in compat.iteritems(expected))
- expected = Panel.fromDict(expected).swapaxes(0, 1)
- expected.major_axis.name, expected.minor_axis.name = 'A', 'B'
-
- # a little bit crude
- for col in ['C', 'D']:
- result_col = op(grouped[col])
- exp = expected[col]
- pivoted = result1[col].unstack()
- pivoted2 = result_col.unstack()
- assert_frame_equal(pivoted.reindex_like(exp), exp)
- assert_frame_equal(pivoted2.reindex_like(exp), exp)
-
- _check_op(lambda x: x.sum())
- _check_op(lambda x: x.mean())
-
- # test single series works the same
- result = data['C'].groupby([data['A'], data['B']]).mean()
- expected = data.groupby(['A', 'B']).mean()['C']
-
- assert_series_equal(result, expected)
-
- def test_groupby_as_index_agg(self):
- grouped = self.df.groupby('A', as_index=False)
-
- # single-key
-
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- assert_frame_equal(result, expected)
-
- result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
- expected2 = grouped.mean()
- expected2['D'] = grouped.sum()['D']
- assert_frame_equal(result2, expected2)
-
- grouped = self.df.groupby('A', as_index=True)
- expected3 = grouped['C'].sum()
- expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
-
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- result3 = grouped['C'].agg({'Q': np.sum})
- assert_frame_equal(result3, expected3)
-
- # multi-key
-
- grouped = self.df.groupby(['A', 'B'], as_index=False)
-
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- assert_frame_equal(result, expected)
-
- result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
- expected2 = grouped.mean()
- expected2['D'] = grouped.sum()['D']
- assert_frame_equal(result2, expected2)
-
- expected3 = grouped['C'].sum()
- expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
+    # issue 11016
+    df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
+    assert len(df.groupby(('a'))) == 0
+    assert len(df.groupby(('b'))) == 3
+    assert len(df.groupby(['a', 'b'])) == 3
+
+
+def test_basic_regression():
+    # regression
+    T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
+    result = Series(T, lrange(0, len(T)))
+
+    groupings = np.random.random((1100, ))
+    groupings = Series(groupings, lrange(0, len(groupings))) * 10.
+
+    grouped = result.groupby(groupings)
+    grouped.mean()
+
+
+@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
+ 'int32', 'int16', 'int8'])
+def test_with_na_groups(dtype):
+ index = Index(np.arange(10))
+ values = Series(np.ones(10), index, dtype=dtype)
+ labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
+ 'bar', 'bar', np.nan, 'foo'], index=index)
+
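+    # rows whose label is NaN are dropped from the groupby entirely,
+    # leaving only the 'bar' (4 rows) and 'foo' (2 rows) groups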
+ # this SHOULD be an int
+ grouped = values.groupby(labels)
+ agged = grouped.agg(len)
+ expected = Series([4, 2], index=['bar', 'foo'])
+
+ assert_series_equal(agged, expected, check_dtype=False)
+
+ # assert issubclass(agged.dtype.type, np.integer)
+
+ # explicitly return a float from my function
+ def f(x):
+ return float(len(x))
+
+ agged = grouped.agg(f)
+ expected = Series([4, 2], index=['bar', 'foo'])
+
+ assert_series_equal(agged, expected, check_dtype=False)
+ assert issubclass(agged.dtype.type, np.dtype(dtype).type)
+
+
+def test_indices_concatenation_order():
+
+ # GH 2808
+
+ def f1(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
+ names=['b', 'c'])
+ res = DataFrame(None, columns=['a'], index=multiindex)
+ return res
+ else:
+ y = y.set_index(['b', 'c'])
+ return y
+
+ def f2(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ return DataFrame()
+ else:
+ y = y.set_index(['b', 'c'])
+ return y
+
+ def f3(x):
+ y = x[(x.b % 2) == 1] ** 2
+ if y.empty:
+ multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
+ names=['foo', 'bar'])
+ res = DataFrame(None, columns=['a', 'b'], index=multiindex)
+ return res
+ else:
+ return y
+
+ df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
+
+ df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
+
+ # correct result
+ result1 = df.groupby('a').apply(f1)
+ result2 = df2.groupby('a').apply(f1)
+ assert_frame_equal(result1, result2)
+
+ # should fail (not the same number of levels)
+ pytest.raises(AssertionError, df.groupby('a').apply, f2)
+ pytest.raises(AssertionError, df2.groupby('a').apply, f2)
+
+ # should fail (incorrect shape)
+ pytest.raises(AssertionError, df.groupby('a').apply, f3)
+ pytest.raises(AssertionError, df2.groupby('a').apply, f3)
+
+
+def test_attr_wrapper(ts):
+ grouped = ts.groupby(lambda x: x.weekday())
+
+ result = grouped.std()
+ expected = grouped.agg(lambda x: np.std(x, ddof=1))
+ assert_series_equal(result, expected)
+
+ # this is pretty cool
+ result = grouped.describe()
+ expected = {}
+ for name, gp in grouped:
+ expected[name] = gp.describe()
+ expected = DataFrame(expected).T
+ assert_frame_equal(result, expected)
+
+ # get attribute
+ result = grouped.dtype
+ expected = grouped.agg(lambda x: x.dtype)
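+    # GroupBy has no .dtype of its own; the attribute is dispatched to
+    # the underlying Series and evaluated per group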
+
+ # make sure raises error
+ pytest.raises(AttributeError, getattr, grouped, 'foo')
+
+
+def test_frame_groupby(tsframe):
+ grouped = tsframe.groupby(lambda x: x.weekday())
+
+ # aggregate
+ aggregated = grouped.aggregate(np.mean)
+ assert len(aggregated) == 5
+ assert len(aggregated.columns) == 4
+
+ # by string
+ tscopy = tsframe.copy()
+ tscopy['weekday'] = [x.weekday() for x in tscopy.index]
+ stragged = tscopy.groupby('weekday').aggregate(np.mean)
+ assert_frame_equal(stragged, aggregated, check_names=False)
+
+ # transform
+ grouped = tsframe.head(30).groupby(lambda x: x.weekday())
+ transformed = grouped.transform(lambda x: x - x.mean())
+ assert len(transformed) == 30
+ assert len(transformed.columns) == 4
+
+ # transform propagate
+ transformed = grouped.transform(lambda x: x.mean())
+ for name, group in grouped:
+ mean = group.mean()
+ for idx in group.index:
+ tm.assert_series_equal(transformed.xs(idx), mean,
+ check_names=False)
+
+ # iterate
+ for weekday, group in grouped:
+ assert group.index[0].weekday() == weekday
+
+ # groups / group_indices
+ groups = grouped.groups
+ indices = grouped.indices
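+    # .groups maps each key to the index labels in that group, while
+    # .indices maps each key to integer positions, so taking those
+    # positions from the original index must reproduce the labels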
+
+ for k, v in compat.iteritems(groups):
+ samething = tsframe.index.take(indices[k])
+ assert (samething == v).all()
+
+
+def test_frame_groupby_columns(tsframe):
+ mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
+ grouped = tsframe.groupby(mapping, axis=1)
+
+ # aggregate
+ aggregated = grouped.aggregate(np.mean)
+ assert len(aggregated) == len(tsframe)
+ assert len(aggregated.columns) == 2
+
+ # transform
+ tf = lambda x: x - x.mean()
+ groupedT = tsframe.T.groupby(mapping, axis=0)
+ assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))
+
+ # iterate
+ for k, v in grouped:
+ assert len(v.columns) == 2
+
+
+def test_frame_set_name_single(df):
+ grouped = df.groupby('A')
+
+ result = grouped.mean()
+ assert result.index.name == 'A'
+
+ result = df.groupby('A', as_index=False).mean()
+ assert result.index.name != 'A'
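+    # with as_index=False the group key becomes a regular column and
+    # the result keeps a default integer index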
+
+ result = grouped.agg(np.mean)
+ assert result.index.name == 'A'
+
+ result = grouped.agg({'C': np.mean, 'D': np.std})
+ assert result.index.name == 'A'
+
+ result = grouped['C'].mean()
+ assert result.index.name == 'A'
+ result = grouped['C'].agg(np.mean)
+ assert result.index.name == 'A'
+ result = grouped['C'].agg([np.mean, np.std])
+ assert result.index.name == 'A'
+
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
+ assert result.index.name == 'A'
+
+
+def test_multi_func(df):
+ col1 = df['A']
+ col2 = df['B']
+
+ grouped = df.groupby([col1.get, col2.get])
+ agged = grouped.mean()
+ expected = df.groupby(['A', 'B']).mean()
+
+ # TODO groupby get drops names
+ assert_frame_equal(agged.loc[:, ['C', 'D']],
+ expected.loc[:, ['C', 'D']],
+ check_names=False)
+
+ # some "groups" with no data
+ df = DataFrame({'v1': np.random.randn(6),
+ 'v2': np.random.randn(6),
+ 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
+ 'k2': np.array(['1', '1', '1', '2', '2', '2'])},
+ index=['one', 'two', 'three', 'four', 'five', 'six'])
+ # only verify that it works for now
+ grouped = df.groupby(['k1', 'k2'])
+ grouped.agg(np.sum)
+
+
+def test_multi_key_multiple_functions(df):
+ grouped = df.groupby(['A', 'B'])['C']
+
+ agged = grouped.agg([np.mean, np.std])
+ expected = DataFrame({'mean': grouped.agg(np.mean),
+ 'std': grouped.agg(np.std)})
+ assert_frame_equal(agged, expected)
+
+
+def test_frame_multi_key_function_list():
+ data = DataFrame(
+ {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny'],
+ 'D': np.random.randn(11),
+ 'E': np.random.randn(11),
+ 'F': np.random.randn(11)})
+
+ grouped = data.groupby(['A', 'B'])
+ funcs = [np.mean, np.std]
+ agged = grouped.agg(funcs)
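+    # aggregating with a list of functions produces a column MultiIndex
+    # of (original column, function name) pairs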
+ expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
+ grouped['F'].agg(funcs)],
+ keys=['D', 'E', 'F'], axis=1)
+ assert (isinstance(agged.index, MultiIndex))
+ assert (isinstance(expected.index, MultiIndex))
+ assert_frame_equal(agged, expected)
+
+
+@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
+def test_groupby_multiple_columns(df, op):
+ data = df
+ grouped = data.groupby(['A', 'B'])
+
+ with catch_warnings(record=True):
+ result1 = op(grouped)
+
+ expected = defaultdict(dict)
+ for n1, gp1 in data.groupby('A'):
+ for n2, gp2 in gp1.groupby('B'):
+ expected[n1][n2] = op(gp2.loc[:, ['C', 'D']])
+ expected = dict((k, DataFrame(v))
+ for k, v in compat.iteritems(expected))
+ expected = Panel.fromDict(expected).swapaxes(0, 1)
+ expected.major_axis.name, expected.minor_axis.name = 'A', 'B'
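+    # Panel is deprecated; it is only used here to assemble the
+    # expected, unstacked result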
+
+ # a little bit crude
+ for col in ['C', 'D']:
+ result_col = op(grouped[col])
+ exp = expected[col]
+ pivoted = result1[col].unstack()
+ pivoted2 = result_col.unstack()
+ assert_frame_equal(pivoted.reindex_like(exp), exp)
+ assert_frame_equal(pivoted2.reindex_like(exp), exp)
+
+ # test single series works the same
+ result = data['C'].groupby([data['A'], data['B']]).mean()
+ expected = data.groupby(['A', 'B']).mean()['C']
+
+ assert_series_equal(result, expected)
+
+
+def test_groupby_as_index_agg(df):
+ grouped = df.groupby('A', as_index=False)
+
+ # single-key
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
+ expected2 = grouped.mean()
+ expected2['D'] = grouped.sum()['D']
+ assert_frame_equal(result2, expected2)
+
+ grouped = df.groupby('A', as_index=True)
+ expected3 = grouped['C'].sum()
+ expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
+
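+    # renaming a SeriesGroupBy aggregation through a dict is
+    # deprecated, hence the FutureWarning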
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
result3 = grouped['C'].agg({'Q': np.sum})
- assert_frame_equal(result3, expected3)
-
- # GH7115 & GH8112 & GH8582
- df = DataFrame(np.random.randint(0, 100, (50, 3)),
- columns=['jim', 'joe', 'jolie'])
- ts = Series(np.random.randint(5, 10, 50), name='jim')
-
- gr = df.groupby(ts)
- gr.nth(0) # invokes set_selection_from_grouper internally
- assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
-
- for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
- gr = df.groupby(ts, as_index=False)
- left = getattr(gr, attr)()
-
- gr = df.groupby(ts.values, as_index=True)
- right = getattr(gr, attr)().reset_index(drop=True)
-
- assert_frame_equal(left, right)
-
- def test_as_index_series_return_frame(self):
- grouped = self.df.groupby('A', as_index=False)
- grouped2 = self.df.groupby(['A', 'B'], as_index=False)
-
- result = grouped['C'].agg(np.sum)
- expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
- assert isinstance(result, DataFrame)
- assert_frame_equal(result, expected)
-
- result2 = grouped2['C'].agg(np.sum)
- expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
- assert isinstance(result2, DataFrame)
- assert_frame_equal(result2, expected2)
-
- result = grouped['C'].sum()
- expected = grouped.sum().loc[:, ['A', 'C']]
- assert isinstance(result, DataFrame)
- assert_frame_equal(result, expected)
-
- result2 = grouped2['C'].sum()
- expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
- assert isinstance(result2, DataFrame)
- assert_frame_equal(result2, expected2)
-
- # corner case
- pytest.raises(Exception, grouped['C'].__getitem__, 'D')
-
- def test_groupby_as_index_cython(self):
- data = self.df
-
- # single-key
- grouped = data.groupby('A', as_index=False)
- result = grouped.mean()
- expected = data.groupby(['A']).mean()
- expected.insert(0, 'A', expected.index)
- expected.index = np.arange(len(expected))
- assert_frame_equal(result, expected)
-
- # multi-key
- grouped = data.groupby(['A', 'B'], as_index=False)
- result = grouped.mean()
- expected = data.groupby(['A', 'B']).mean()
-
- arrays = lzip(*expected.index.values)
- expected.insert(0, 'A', arrays[0])
- expected.insert(1, 'B', arrays[1])
- expected.index = np.arange(len(expected))
- assert_frame_equal(result, expected)
-
- def test_groupby_as_index_series_scalar(self):
- grouped = self.df.groupby(['A', 'B'], as_index=False)
-
- # GH #421
-
- result = grouped['C'].agg(len)
- expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
- assert_frame_equal(result, expected)
-
- def test_groupby_as_index_corner(self):
- pytest.raises(TypeError, self.ts.groupby, lambda x: x.weekday(),
- as_index=False)
-
- pytest.raises(ValueError, self.df.groupby, lambda x: x.lower(),
- as_index=False, axis=1)
-
- def test_groupby_as_index_apply(self):
- # GH #4648 and #3417
- df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
- 'user_id': [1, 2, 1, 1, 3, 1],
- 'time': range(6)})
-
- g_as = df.groupby('user_id', as_index=True)
- g_not_as = df.groupby('user_id', as_index=False)
-
- res_as = g_as.head(2).index
- res_not_as = g_not_as.head(2).index
- exp = Index([0, 1, 2, 4])
- assert_index_equal(res_as, exp)
- assert_index_equal(res_not_as, exp)
-
- res_as_apply = g_as.apply(lambda x: x.head(2)).index
- res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
-
- # apply doesn't maintain the original ordering
- # changed in GH5610 as the as_index=False returns a MI here
- exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
- 2, 4)])
- tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
- exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
-
- assert_index_equal(res_as_apply, exp_as_apply)
- assert_index_equal(res_not_as_apply, exp_not_as_apply)
-
- ind = Index(list('abcde'))
- df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
- res = df.groupby(0, as_index=False).apply(lambda x: x).index
- assert_index_equal(res, ind)
-
- def test_groupby_multiple_key(self):
- df = tm.makeTimeDataFrame()
- grouped = df.groupby([lambda x: x.year, lambda x: x.month,
- lambda x: x.day])
- agged = grouped.sum()
- assert_almost_equal(df.values, agged.values)
-
- grouped = df.T.groupby([lambda x: x.year,
- lambda x: x.month,
- lambda x: x.day], axis=1)
-
- agged = grouped.agg(lambda x: x.sum())
- tm.assert_index_equal(agged.index, df.columns)
- assert_almost_equal(df.T.values, agged.values)
-
- agged = grouped.agg(lambda x: x.sum())
- assert_almost_equal(df.T.values, agged.values)
-
- def test_groupby_multi_corner(self):
- # test that having an all-NA column doesn't mess you up
- df = self.df.copy()
- df['bad'] = np.nan
- agged = df.groupby(['A', 'B']).mean()
-
- expected = self.df.groupby(['A', 'B']).mean()
- expected['bad'] = np.nan
-
- assert_frame_equal(agged, expected)
-
- def test_omit_nuisance(self):
- grouped = self.df.groupby('A')
-
- result = grouped.mean()
- expected = self.df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
- assert_frame_equal(result, expected)
-
- agged = grouped.agg(np.mean)
- exp = grouped.mean()
- assert_frame_equal(agged, exp)
-
- df = self.df.loc[:, ['A', 'C', 'D']]
- df['E'] = datetime.now()
- grouped = df.groupby('A')
- result = grouped.agg(np.sum)
- expected = grouped.sum()
- assert_frame_equal(result, expected)
-
- # won't work with axis = 1
- grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
- result = pytest.raises(TypeError, grouped.agg,
- lambda x: x.sum(0, numeric_only=False))
-
- def test_omit_nuisance_python_multiple(self):
- grouped = self.three_group.groupby(['A', 'B'])
-
- agged = grouped.agg(np.mean)
- exp = grouped.mean()
- assert_frame_equal(agged, exp)
-
- def test_empty_groups_corner(self):
- # handle empty groups
- df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
- 'k2': np.array(['1', '1', '1', '2', '2', '2']),
- 'k3': ['foo', 'bar'] * 3,
- 'v1': np.random.randn(6),
- 'v2': np.random.randn(6)})
-
- grouped = df.groupby(['k1', 'k2'])
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- assert_frame_equal(result, expected)
-
- grouped = self.mframe[3:5].groupby(level=0)
- agged = grouped.apply(lambda x: x.mean())
- agged_A = grouped['A'].apply(np.mean)
- assert_series_equal(agged['A'], agged_A)
- assert agged.index.name == 'first'
-
- def test_apply_concat_preserve_names(self):
- grouped = self.three_group.groupby(['A', 'B'])
-
- def desc(group):
- result = group.describe()
- result.index.name = 'stat'
- return result
-
- def desc2(group):
- result = group.describe()
- result.index.name = 'stat'
- result = result[:len(group)]
- # weirdo
- return result
-
- def desc3(group):
- result = group.describe()
-
- # names are different
- result.index.name = 'stat_%d' % len(group)
-
- result = result[:len(group)]
- # weirdo
- return result
-
- result = grouped.apply(desc)
- assert result.index.names == ('A', 'B', 'stat')
-
- result2 = grouped.apply(desc2)
- assert result2.index.names == ('A', 'B', 'stat')
-
- result3 = grouped.apply(desc3)
- assert result3.index.names == ('A', 'B', None)
-
- def test_nonsense_func(self):
- df = DataFrame([0])
- pytest.raises(Exception, df.groupby, lambda x: x + 'foo')
-
- def test_builtins_apply(self): # GH8155
- df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
- columns=['jim', 'joe'])
- df['jolie'] = np.random.randn(1000)
-
- for keys in ['jim', ['jim', 'joe']]: # single key & multi-key
- if keys == 'jim':
- continue
- for f in [max, min, sum]:
- fname = f.__name__
- result = df.groupby(keys).apply(f)
- result.shape
- ngroups = len(df.drop_duplicates(subset=keys))
- assert result.shape == (ngroups, 3), 'invalid frame shape: '\
- '{} (expected ({}, 3))'.format(result.shape, ngroups)
-
- assert_frame_equal(result, # numpy's equivalent function
- df.groupby(keys).apply(getattr(np, fname)))
-
- if f != sum:
- expected = df.groupby(keys).agg(fname).reset_index()
- expected.set_index(keys, inplace=True, drop=False)
- assert_frame_equal(result, expected, check_dtype=False)
-
- assert_series_equal(getattr(result, fname)(),
- getattr(df, fname)())
-
- def test_max_min_non_numeric(self):
- # #2700
- aa = DataFrame({'nn': [11, 11, 22, 22],
- 'ii': [1, 2, 3, 4],
- 'ss': 4 * ['mama']})
-
- result = aa.groupby('nn').max()
- assert 'ss' in result
-
- result = aa.groupby('nn').max(numeric_only=False)
- assert 'ss' in result
-
- result = aa.groupby('nn').min()
- assert 'ss' in result
-
- result = aa.groupby('nn').min(numeric_only=False)
- assert 'ss' in result
-
- def test_arg_passthru(self):
- # make sure that we are passing thru kwargs
- # to our agg functions
-
- # GH3668
- # GH5724
- df = pd.DataFrame(
- {'group': [1, 1, 2],
- 'int': [1, 2, 3],
- 'float': [4., 5., 6.],
- 'string': list('abc'),
- 'category_string': pd.Series(list('abc')).astype('category'),
- 'category_int': [7, 8, 9],
- 'datetime': pd.date_range('20130101', periods=3),
- 'datetimetz': pd.date_range('20130101',
- periods=3,
- tz='US/Eastern'),
- 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
- columns=['group', 'int', 'float', 'string',
- 'category_string', 'category_int',
- 'datetime', 'datetimetz',
- 'timedelta'])
-
- expected_columns_numeric = Index(['int', 'float', 'category_int'])
-
- # mean / median
- expected = pd.DataFrame(
- {'category_int': [7.5, 9],
- 'float': [4.5, 6.],
- 'timedelta': [pd.Timedelta('1.5s'),
- pd.Timedelta('3s')],
- 'int': [1.5, 3],
- 'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
- pd.Timestamp('2013-01-03 00:00:00')],
- 'datetimetz': [
- pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
- pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
- index=Index([1, 2], name='group'),
- columns=['int', 'float', 'category_int',
- 'datetime', 'datetimetz', 'timedelta'])
- for attr in ['mean', 'median']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns_numeric)
-
- result = f(numeric_only=False)
- assert_frame_equal(result.reindex_like(expected), expected)
-
- # TODO: min, max *should* handle
- # categorical (ordered) dtype
- expected_columns = Index(['int', 'float', 'string',
- 'category_int',
- 'datetime', 'datetimetz',
- 'timedelta'])
- for attr in ['min', 'max']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- expected_columns = Index(['int', 'float', 'string',
- 'category_string', 'category_int',
- 'datetime', 'datetimetz',
- 'timedelta'])
- for attr in ['first', 'last']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- expected_columns = Index(['int', 'float', 'string',
- 'category_int', 'timedelta'])
- for attr in ['sum']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns_numeric)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- expected_columns = Index(['int', 'float', 'category_int'])
- for attr in ['prod', 'cumprod']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns_numeric)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- # like min, max, but don't include strings
- expected_columns = Index(['int', 'float',
- 'category_int',
- 'datetime', 'datetimetz',
- 'timedelta'])
- for attr in ['cummin', 'cummax']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- # GH 15561: numeric_only=False set by default like min/max
- tm.assert_index_equal(result.columns, expected_columns)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- expected_columns = Index(['int', 'float', 'category_int',
- 'timedelta'])
- for attr in ['cumsum']:
- f = getattr(df.groupby('group'), attr)
- result = f()
- tm.assert_index_equal(result.columns, expected_columns_numeric)
-
- result = f(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
-
- def test_wrap_aggregated_output_multindex(self):
- df = self.mframe.T
- df['baz', 'two'] = 'peekaboo'
-
- keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
- agged = df.groupby(keys).agg(np.mean)
- assert isinstance(agged.columns, MultiIndex)
-
- def aggfun(ser):
- if ser.name == ('foo', 'one'):
- raise TypeError
- else:
- return ser.sum()
-
- agged2 = df.groupby(keys).aggregate(aggfun)
- assert len(agged2.columns) + 1 == len(df.columns)
-
- def test_groupby_level_apply(self):
- frame = self.mframe
-
- result = frame.groupby(level=0).count()
- assert result.index.name == 'first'
- result = frame.groupby(level=1).count()
- assert result.index.name == 'second'
-
- result = frame['A'].groupby(level=0).count()
- assert result.index.name == 'first'
-
- def test_groupby_level_mapper(self):
- frame = self.mframe
- deleveled = frame.reset_index()
-
- mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
- mapper1 = {'one': 0, 'two': 0, 'three': 1}
-
- result0 = frame.groupby(mapper0, level=0).sum()
- result1 = frame.groupby(mapper1, level=1).sum()
-
- mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
- mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
- expected0 = frame.groupby(mapped_level0).sum()
- expected1 = frame.groupby(mapped_level1).sum()
- expected0.index.name, expected1.index.name = 'first', 'second'
-
- assert_frame_equal(result0, expected0)
- assert_frame_equal(result1, expected1)
-
- def test_groupby_level_nonmulti(self):
- # GH 1313, GH 13901
- s = Series([1, 2, 3, 10, 4, 5, 20, 6],
- Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
- expected = Series([11, 22, 3, 4, 5, 6],
- Index(range(1, 7), name='foo'))
-
- result = s.groupby(level=0).sum()
- tm.assert_series_equal(result, expected)
- result = s.groupby(level=[0]).sum()
- tm.assert_series_equal(result, expected)
- result = s.groupby(level=-1).sum()
- tm.assert_series_equal(result, expected)
- result = s.groupby(level=[-1]).sum()
- tm.assert_series_equal(result, expected)
-
- pytest.raises(ValueError, s.groupby, level=1)
- pytest.raises(ValueError, s.groupby, level=-2)
- pytest.raises(ValueError, s.groupby, level=[])
- pytest.raises(ValueError, s.groupby, level=[0, 0])
- pytest.raises(ValueError, s.groupby, level=[0, 1])
- pytest.raises(ValueError, s.groupby, level=[1])
-
- def test_groupby_complex(self):
- # GH 12902
- a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
- expected = Series((1 + 2j, 5 + 10j))
-
- result = a.groupby(level=0).sum()
- assert_series_equal(result, expected)
-
- result = a.sum(level=0)
- assert_series_equal(result, expected)
-
- def test_apply_series_to_frame(self):
- def f(piece):
- with np.errstate(invalid='ignore'):
- logged = np.log(piece)
- return DataFrame({'value': piece,
- 'demeaned': piece - piece.mean(),
- 'logged': logged})
-
- dr = bdate_range('1/1/2000', periods=100)
- ts = Series(np.random.randn(100), index=dr)
-
- grouped = ts.groupby(lambda x: x.month)
- result = grouped.apply(f)
-
- assert isinstance(result, DataFrame)
- tm.assert_index_equal(result.index, ts.index)
-
- def test_apply_series_yield_constant(self):
- result = self.df.groupby(['A', 'B'])['C'].apply(len)
- assert result.index.names[:2] == ('A', 'B')
-
- def test_apply_frame_yield_constant(self):
- # GH13568
- result = self.df.groupby(['A', 'B']).apply(len)
- assert isinstance(result, Series)
- assert result.name is None
-
- result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len)
- assert isinstance(result, Series)
- assert result.name is None
-
- def test_apply_frame_to_series(self):
- grouped = self.df.groupby(['A', 'B'])
- result = grouped.apply(len)
- expected = grouped.count()['C']
- tm.assert_index_equal(result.index, expected.index)
- tm.assert_numpy_array_equal(result.values, expected.values)
-
- def test_apply_frame_concat_series(self):
- def trans(group):
- return group.groupby('B')['C'].sum().sort_values()[:2]
-
- def trans2(group):
- grouped = group.groupby(df.reindex(group.index)['B'])
- return grouped.sum().sort_values()[:2]
-
- df = DataFrame({'A': np.random.randint(0, 5, 1000),
- 'B': np.random.randint(0, 5, 1000),
- 'C': np.random.randn(1000)})
-
- result = df.groupby('A').apply(trans)
- exp = df.groupby('A')['C'].apply(trans2)
- assert_series_equal(result, exp, check_names=False)
- assert result.name == 'C'
-
- def test_apply_transform(self):
- grouped = self.ts.groupby(lambda x: x.month)
- result = grouped.apply(lambda x: x * 2)
- expected = grouped.transform(lambda x: x * 2)
- assert_series_equal(result, expected)
-
- def test_apply_multikey_corner(self):
- grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
-
- def f(group):
- return group.sort_values('A')[-5:]
-
- result = grouped.apply(f)
- for key, group in grouped:
- assert_frame_equal(result.loc[key], f(group))
-
- def test_mutate_groups(self):
-
- # GH3380
-
- mydf = DataFrame({
- 'cat1': ['a'] * 8 + ['b'] * 6,
- 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
- ['d'] * 2 + ['e'] * 2,
- 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
- 'val': np.random.randint(100, size=14),
- })
-
- def f_copy(x):
- x = x.copy()
- x['rank'] = x.val.rank(method='min')
- return x.groupby('cat2')['rank'].min()
-
- def f_no_copy(x):
- x['rank'] = x.val.rank(method='min')
- return x.groupby('cat2')['rank'].min()
-
- grpby_copy = mydf.groupby('cat1').apply(f_copy)
- grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy)
- assert_series_equal(grpby_copy, grpby_no_copy)
-
- def test_no_mutate_but_looks_like(self):
-
- # GH 8467
- # first show's mutation indicator
- # second does not, but should yield the same results
- df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})
-
- result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
- result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
- assert_series_equal(result1, result2)
-
- def test_apply_chunk_view(self):
- # Low level tinkering could be unsafe, make sure not
- df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
- 'value': lrange(9)})
-
- # return view
- f = lambda x: x[:2]
-
- result = df.groupby('key', group_keys=False).apply(f)
- expected = df.take([0, 1, 3, 4, 6, 7])
- assert_frame_equal(result, expected)
-
- def test_apply_no_name_column_conflict(self):
- df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
- 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
- 'value': lrange(10)[::-1]})
-
- # it works! #2605
- grouped = df.groupby(['name', 'name2'])
- grouped.apply(lambda x: x.sort_values('value', inplace=True))
-
- def test_groupby_series_indexed_differently(self):
- s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
- index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
- s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
- index=Index(['a', 'b', 'd', 'f', 'g', 'h']))
-
- grouped = s1.groupby(s2)
- agged = grouped.mean()
- exp = s1.groupby(s2.reindex(s1.index).get).mean()
- assert_series_equal(agged, exp)
-
- def test_groupby_with_hier_columns(self):
- tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
- 'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
- 'one', 'two']]))
- index = MultiIndex.from_tuples(tuples)
- columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
- 'B', 'cat'), ('A', 'dog')])
- df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
-
- result = df.groupby(level=0).mean()
- tm.assert_index_equal(result.columns, columns)
-
- result = df.groupby(level=0, axis=1).mean()
- tm.assert_index_equal(result.index, df.index)
-
- result = df.groupby(level=0).agg(np.mean)
- tm.assert_index_equal(result.columns, columns)
-
- result = df.groupby(level=0).apply(lambda x: x.mean())
- tm.assert_index_equal(result.columns, columns)
-
- result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
- tm.assert_index_equal(result.columns, Index(['A', 'B']))
- tm.assert_index_equal(result.index, df.index)
-
- # add a nuisance column
- sorted_columns, _ = columns.sortlevel(0)
- df['A', 'foo'] = 'bar'
- result = df.groupby(level=0).mean()
- tm.assert_index_equal(result.columns, df.columns[:-1])
-
- def test_pass_args_kwargs(self):
- from numpy import percentile
-
- def f(x, q=None, axis=0):
- return percentile(x, q, axis=axis)
-
- g = lambda x: percentile(x, 80, axis=0)
-
- # Series
- ts_grouped = self.ts.groupby(lambda x: x.month)
- agg_result = ts_grouped.agg(percentile, 80, axis=0)
- apply_result = ts_grouped.apply(percentile, 80, axis=0)
- trans_result = ts_grouped.transform(percentile, 80, axis=0)
-
- agg_expected = ts_grouped.quantile(.8)
- trans_expected = ts_grouped.transform(g)
-
- assert_series_equal(apply_result, agg_expected)
- assert_series_equal(agg_result, agg_expected, check_names=False)
- assert_series_equal(trans_result, trans_expected)
-
- agg_result = ts_grouped.agg(f, q=80)
- apply_result = ts_grouped.apply(f, q=80)
- trans_result = ts_grouped.transform(f, q=80)
- assert_series_equal(agg_result, agg_expected)
- assert_series_equal(apply_result, agg_expected)
- assert_series_equal(trans_result, trans_expected)
-
- # DataFrame
- df_grouped = self.tsframe.groupby(lambda x: x.month)
- agg_result = df_grouped.agg(percentile, 80, axis=0)
- apply_result = df_grouped.apply(DataFrame.quantile, .8)
- expected = df_grouped.quantile(.8)
- assert_frame_equal(apply_result, expected)
- assert_frame_equal(agg_result, expected, check_names=False)
-
- agg_result = df_grouped.agg(f, q=80)
- apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
- assert_frame_equal(agg_result, expected, check_names=False)
- assert_frame_equal(apply_result, expected)
-
- def test_non_cython_api(self):
-
- # GH5610
- # non-cython calls should not include the grouper
-
- df = DataFrame(
- [[1, 2, 'foo'],
- [1, np.nan, 'bar'],
- [3, np.nan, 'baz']],
- columns=['A', 'B', 'C'])
- g = df.groupby('A')
- gni = df.groupby('A', as_index=False)
-
- # mad
- expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
- expected.index.name = 'A'
- result = g.mad()
- assert_frame_equal(result, expected)
-
- expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
- index=[0, 1])
- result = gni.mad()
- assert_frame_equal(result, expected)
-
- # describe
- expected_index = pd.Index([1, 3], name='A')
- expected_col = pd.MultiIndex(levels=[['B'],
- ['count', 'mean', 'std', 'min',
- '25%', '50%', '75%', 'max']],
- labels=[[0] * 8, list(range(8))])
- expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
- [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
- np.nan, np.nan]],
- index=expected_index,
- columns=expected_col)
- result = g.describe()
- assert_frame_equal(result, expected)
-
- expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
- df[df.A == 3].describe().unstack().to_frame().T])
- expected.index = pd.Index([0, 1])
- result = gni.describe()
- assert_frame_equal(result, expected)
-
- # any
- expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
- index=[1, 3])
- expected.index.name = 'A'
- result = g.any()
- assert_frame_equal(result, expected)
-
- # idxmax
- expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
- expected.index.name = 'A'
- result = g.idxmax()
- assert_frame_equal(result, expected)
-
- def test_cython_api2(self):
-
- # this takes the fast apply path
-
- # cumsum (GH5614)
- df = DataFrame(
- [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
- ], columns=['A', 'B', 'C'])
- expected = DataFrame(
- [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
- result = df.groupby('A').cumsum()
- assert_frame_equal(result, expected)
-
- # GH 5755 - cumsum is a transformer and should ignore as_index
- result = df.groupby('A', as_index=False).cumsum()
- assert_frame_equal(result, expected)
-
- # GH 13994
- result = df.groupby('A').cumsum(axis=1)
- expected = df.cumsum(axis=1)
- assert_frame_equal(result, expected)
- result = df.groupby('A').cumprod(axis=1)
- expected = df.cumprod(axis=1)
- assert_frame_equal(result, expected)
-
- def test_grouping_ndarray(self):
- grouped = self.df.groupby(self.df['A'].values)
-
- result = grouped.sum()
- expected = self.df.groupby('A').sum()
- assert_frame_equal(result, expected, check_names=False
- ) # Note: no names when grouping by value
-
- def test_apply_typecast_fail(self):
- df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
- 'c': np.tile(
- ['a', 'b', 'c'], 2),
- 'v': np.arange(1., 7.)})
-
- def f(group):
- v = group['v']
- group['v2'] = (v - v.min()) / (v.max() - v.min())
- return group
-
- result = df.groupby('d').apply(f)
-
- expected = df.copy()
- expected['v2'] = np.tile([0., 0.5, 1], 2)
-
- assert_frame_equal(result, expected)
-
- def test_apply_multiindex_fail(self):
- index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
- ])
- df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
- 'c': np.tile(['a', 'b', 'c'], 2),
- 'v': np.arange(1., 7.)}, index=index)
-
- def f(group):
- v = group['v']
- group['v2'] = (v - v.min()) / (v.max() - v.min())
- return group
-
- result = df.groupby('d').apply(f)
-
- expected = df.copy()
- expected['v2'] = np.tile([0., 0.5, 1], 2)
-
- assert_frame_equal(result, expected)
-
- def test_apply_corner(self):
- result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
- expected = self.tsframe * 2
- assert_frame_equal(result, expected)
-
- def test_apply_without_copy(self):
- # GH 5545
- # returning a non-copy in an applied function fails
-
- data = DataFrame({'id_field': [100, 100, 200, 300],
- 'category': ['a', 'b', 'c', 'c'],
- 'value': [1, 2, 3, 4]})
-
- def filt1(x):
- if x.shape[0] == 1:
- return x.copy()
- else:
- return x[x.category == 'c']
-
- def filt2(x):
- if x.shape[0] == 1:
- return x
- else:
- return x[x.category == 'c']
-
- expected = data.groupby('id_field').apply(filt1)
- result = data.groupby('id_field').apply(filt2)
- assert_frame_equal(result, expected)
-
- def test_apply_corner_cases(self):
- # #535, can't use sliding iterator
-
- N = 1000
- labels = np.random.randint(0, 100, size=N)
- df = DataFrame({'key': labels,
- 'value1': np.random.randn(N),
- 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
-
- grouped = df.groupby('key')
-
- def f(g):
- g['value3'] = g['value1'] * 2
- return g
-
- result = grouped.apply(f)
- assert 'value3' in result
-    def test_groupby_wrong_multi_labels(self):
-        data = """index,foo,bar,baz,spam,data
+    assert_frame_equal(result3, expected3)
+
+ # multi-key
+
+ grouped = df.groupby(['A', 'B'], as_index=False)
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
+ expected2 = grouped.mean()
+ expected2['D'] = grouped.sum()['D']
+ assert_frame_equal(result2, expected2)
+
+ expected3 = grouped['C'].sum()
+ expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
+ result3 = grouped['C'].agg({'Q': np.sum})
+ assert_frame_equal(result3, expected3)
+
+ # GH7115 & GH8112 & GH8582
+ df = DataFrame(np.random.randint(0, 100, (50, 3)),
+ columns=['jim', 'joe', 'jolie'])
+ ts = Series(np.random.randint(5, 10, 50), name='jim')
+
+ gr = df.groupby(ts)
+ gr.nth(0) # invokes set_selection_from_grouper internally
+ assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
+
+ for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
+ gr = df.groupby(ts, as_index=False)
+ left = getattr(gr, attr)()
+
+ gr = df.groupby(ts.values, as_index=True)
+ right = getattr(gr, attr)().reset_index(drop=True)
+
+ assert_frame_equal(left, right)
+
+
+def test_as_index_series_return_frame(df):
+ grouped = df.groupby('A', as_index=False)
+ grouped2 = df.groupby(['A', 'B'], as_index=False)
+
+ result = grouped['C'].agg(np.sum)
+ expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
+ assert isinstance(result, DataFrame)
+ assert_frame_equal(result, expected)
+
+ result2 = grouped2['C'].agg(np.sum)
+ expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
+ assert isinstance(result2, DataFrame)
+ assert_frame_equal(result2, expected2)
+
+ result = grouped['C'].sum()
+ expected = grouped.sum().loc[:, ['A', 'C']]
+ assert isinstance(result, DataFrame)
+ assert_frame_equal(result, expected)
+
+ result2 = grouped2['C'].sum()
+ expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
+ assert isinstance(result2, DataFrame)
+ assert_frame_equal(result2, expected2)
+
+    # corner case: cannot select a second column
+    # from an already column-selected groupby
+ pytest.raises(Exception, grouped['C'].__getitem__, 'D')
+
+
+def test_groupby_as_index_cython(df):
+ data = df
+
+ # single-key
+ grouped = data.groupby('A', as_index=False)
+ result = grouped.mean()
+ expected = data.groupby(['A']).mean()
+ expected.insert(0, 'A', expected.index)
+ expected.index = np.arange(len(expected))
+ assert_frame_equal(result, expected)
+
+ # multi-key
+ grouped = data.groupby(['A', 'B'], as_index=False)
+ result = grouped.mean()
+ expected = data.groupby(['A', 'B']).mean()
+
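+    # unzip the MultiIndex tuples into parallel arrays so the group
+    # keys can be re-inserted as flat columns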
+ arrays = lzip(*expected.index.values)
+ expected.insert(0, 'A', arrays[0])
+ expected.insert(1, 'B', arrays[1])
+ expected.index = np.arange(len(expected))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_series_scalar(df):
+ grouped = df.groupby(['A', 'B'], as_index=False)
+
+ # GH #421
+
+ result = grouped['C'].agg(len)
+ expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_corner(df, ts):
+ pytest.raises(TypeError, ts.groupby, lambda x: x.weekday(),
+ as_index=False)
+
+ pytest.raises(ValueError, df.groupby, lambda x: x.lower(),
+ as_index=False, axis=1)
+
+
+def test_groupby_multiple_key():
+    df = tm.makeTimeDataFrame()
+ grouped = df.groupby([lambda x: x.year, lambda x: x.month,
+ lambda x: x.day])
+ agged = grouped.sum()
+ assert_almost_equal(df.values, agged.values)
+
+ grouped = df.T.groupby([lambda x: x.year,
+ lambda x: x.month,
+ lambda x: x.day], axis=1)
+
+ agged = grouped.agg(lambda x: x.sum())
+ tm.assert_index_equal(agged.index, df.columns)
+ assert_almost_equal(df.T.values, agged.values)
+
+ agged = grouped.agg(lambda x: x.sum())
+ assert_almost_equal(df.T.values, agged.values)
+
+
+def test_groupby_multi_corner(df):
+ # test that having an all-NA column doesn't mess you up
+ df = df.copy()
+ df['bad'] = np.nan
+ agged = df.groupby(['A', 'B']).mean()
+
+ expected = df.groupby(['A', 'B']).mean()
+ expected['bad'] = np.nan
+
+ assert_frame_equal(agged, expected)
+
+
+def test_omit_nuisance(df):
+ grouped = df.groupby('A')
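+    # 'B' holds strings, so mean() silently drops it as a nuisance
+    # column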
+
+ result = grouped.mean()
+ expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
+ assert_frame_equal(result, expected)
+
+ agged = grouped.agg(np.mean)
+ exp = grouped.mean()
+ assert_frame_equal(agged, exp)
+
+ df = df.loc[:, ['A', 'C', 'D']]
+ df['E'] = datetime.now()
+ grouped = df.groupby('A')
+ result = grouped.agg(np.sum)
+ expected = grouped.sum()
+ assert_frame_equal(result, expected)
+
+ # won't work with axis = 1
+ grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
+ result = pytest.raises(TypeError, grouped.agg,
+ lambda x: x.sum(0, numeric_only=False))
+
+
+def test_omit_nuisance_python_multiple(three_group):
+ grouped = three_group.groupby(['A', 'B'])
+
+ agged = grouped.agg(np.mean)
+ exp = grouped.mean()
+ assert_frame_equal(agged, exp)
+
+
+def test_empty_groups_corner(mframe):
+ # handle empty groups
+ df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
+ 'k2': np.array(['1', '1', '1', '2', '2', '2']),
+ 'k3': ['foo', 'bar'] * 3,
+ 'v1': np.random.randn(6),
+ 'v2': np.random.randn(6)})
+
+ grouped = df.groupby(['k1', 'k2'])
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+ grouped = mframe[3:5].groupby(level=0)
+ agged = grouped.apply(lambda x: x.mean())
+ agged_A = grouped['A'].apply(np.mean)
+ assert_series_equal(agged['A'], agged_A)
+ assert agged.index.name == 'first'
+
+
+def test_nonsense_func():
+ df = DataFrame([0])
+ pytest.raises(Exception, df.groupby, lambda x: x + 'foo')
+
+
+def test_wrap_aggregated_output_multindex(mframe):
+ df = mframe.T
+ df['baz', 'two'] = 'peekaboo'
+
+ keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
+ agged = df.groupby(keys).agg(np.mean)
+ assert isinstance(agged.columns, MultiIndex)
+
+ def aggfun(ser):
+ if ser.name == ('foo', 'one'):
+ raise TypeError
+ else:
+ return ser.sum()
+
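+    # a column whose aggregation raises TypeError is dropped from the
+    # result rather than failing the whole aggregation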
+ agged2 = df.groupby(keys).aggregate(aggfun)
+ assert len(agged2.columns) + 1 == len(df.columns)
+
+
+def test_groupby_level_apply(mframe):
+
+ result = mframe.groupby(level=0).count()
+ assert result.index.name == 'first'
+ result = mframe.groupby(level=1).count()
+ assert result.index.name == 'second'
+
+ result = mframe['A'].groupby(level=0).count()
+ assert result.index.name == 'first'
+
+
+def test_groupby_level_mapper(mframe):
+ deleveled = mframe.reset_index()
+
+ mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
+ mapper1 = {'one': 0, 'two': 0, 'three': 1}
+
+ result0 = mframe.groupby(mapper0, level=0).sum()
+ result1 = mframe.groupby(mapper1, level=1).sum()
+
+ mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
+ mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
+ expected0 = mframe.groupby(mapped_level0).sum()
+ expected1 = mframe.groupby(mapped_level1).sum()
+ expected0.index.name, expected1.index.name = 'first', 'second'
+
+ assert_frame_equal(result0, expected0)
+ assert_frame_equal(result1, expected1)
+
+
+def test_groupby_level_nonmulti():
+ # GH 1313, GH 13901
+ s = Series([1, 2, 3, 10, 4, 5, 20, 6],
+ Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
+ expected = Series([11, 22, 3, 4, 5, 6],
+ Index(range(1, 7), name='foo'))
+
+ result = s.groupby(level=0).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=[0]).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=-1).sum()
+ tm.assert_series_equal(result, expected)
+ result = s.groupby(level=[-1]).sum()
+ tm.assert_series_equal(result, expected)
+
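+    # only a single, existing level may be referenced on a
+    # non-MultiIndex axis; anything else raises ValueError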
+ pytest.raises(ValueError, s.groupby, level=1)
+ pytest.raises(ValueError, s.groupby, level=-2)
+ pytest.raises(ValueError, s.groupby, level=[])
+ pytest.raises(ValueError, s.groupby, level=[0, 0])
+ pytest.raises(ValueError, s.groupby, level=[0, 1])
+ pytest.raises(ValueError, s.groupby, level=[1])
+
+
+def test_groupby_complex():
+ # GH 12902
+ a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
+ expected = Series((1 + 2j, 5 + 10j))
+
+ result = a.groupby(level=0).sum()
+ assert_series_equal(result, expected)
+
+ result = a.sum(level=0)
+ assert_series_equal(result, expected)
+
+
+def test_mutate_groups():
+
+ # GH3380
+
+ df = DataFrame({
+ 'cat1': ['a'] * 8 + ['b'] * 6,
+ 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
+ ['d'] * 2 + ['e'] * 2,
+ 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
+ 'val': np.random.randint(100, size=14),
+ })
+
+ def f_copy(x):
+ x = x.copy()
+ x['rank'] = x.val.rank(method='min')
+ return x.groupby('cat2')['rank'].min()
+
+ def f_no_copy(x):
+ x['rank'] = x.val.rank(method='min')
+ return x.groupby('cat2')['rank'].min()
+
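+    # whether the applied function mutates its input group or works on
+    # a copy, the results must agree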
+ grpby_copy = df.groupby('cat1').apply(f_copy)
+ grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
+ assert_series_equal(grpby_copy, grpby_no_copy)
+
+
+def test_no_mutate_but_looks_like():
+
+ # GH 8467
+    # the first apply shows the mutation indicator,
+    # the second does not, but both should yield the same results
+ df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})
+
+ result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
+ result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key)
+ assert_series_equal(result1, result2)
+
+
+def test_groupby_series_indexed_differently():
+ s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
+ index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
+ s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
+ index=Index(['a', 'b', 'd', 'f', 'g', 'h']))
+
+ grouped = s1.groupby(s2)
+ agged = grouped.mean()
+ exp = s1.groupby(s2.reindex(s1.index).get).mean()
+ assert_series_equal(agged, exp)
+
+
+def test_groupby_with_hier_columns():
+ tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
+ 'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
+ 'one', 'two']]))
+ index = MultiIndex.from_tuples(tuples)
+ columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
+ 'B', 'cat'), ('A', 'dog')])
+ df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
+
+ result = df.groupby(level=0).mean()
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0, axis=1).mean()
+ tm.assert_index_equal(result.index, df.index)
+
+ result = df.groupby(level=0).agg(np.mean)
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0).apply(lambda x: x.mean())
+ tm.assert_index_equal(result.columns, columns)
+
+ result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
+ tm.assert_index_equal(result.columns, Index(['A', 'B']))
+ tm.assert_index_equal(result.index, df.index)
+
+ # add a nuisance column
+ sorted_columns, _ = columns.sortlevel(0)
+ df['A', 'foo'] = 'bar'
+ result = df.groupby(level=0).mean()
+ tm.assert_index_equal(result.columns, df.columns[:-1])
+
+def test_grouping_ndarray(df):
+ grouped = df.groupby(df['A'].values)
+
+ result = grouped.sum()
+ expected = df.groupby('A').sum()
+    # note: no names when grouping by value
+    assert_frame_equal(result, expected, check_names=False)
+
+
+def test_groupby_wrong_multi_labels():
+ data = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""
- data = read_csv(StringIO(data), index_col=0)
-
- grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
-
- result = grouped.agg(np.mean)
- expected = grouped.mean()
- assert_frame_equal(result, expected)
-
- def test_groupby_series_with_name(self):
- result = self.df.groupby(self.df['A']).mean()
- result2 = self.df.groupby(self.df['A'], as_index=False).mean()
- assert result.index.name == 'A'
- assert 'A' in result2
-
- result = self.df.groupby([self.df['A'], self.df['B']]).mean()
- result2 = self.df.groupby([self.df['A'], self.df['B']],
- as_index=False).mean()
- assert result.index.names == ('A', 'B')
- assert 'A' in result2
- assert 'B' in result2
-
- def test_seriesgroupby_name_attr(self):
- # GH 6265
- result = self.df.groupby('A')['C']
- assert result.count().name == 'C'
- assert result.mean().name == 'C'
-
- testFunc = lambda x: np.sum(x) * 2
- assert result.agg(testFunc).name == 'C'
-
- def test_consistency_name(self):
- # GH 12363
-
- df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
- 'foo', 'bar', 'foo', 'foo'],
- 'B': ['one', 'one', 'two', 'two',
- 'two', 'two', 'one', 'two'],
- 'C': np.random.randn(8) + 1.0,
- 'D': np.arange(8)})
-
- expected = df.groupby(['A']).B.count()
- result = df.B.groupby(df.A).count()
- assert_series_equal(result, expected)
-
- def test_groupby_name_propagation(self):
- # GH 6124
- def summarize(df, name=None):
- return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)
-
- def summarize_random_name(df):
- # Provide a different name for each Series. In this case, groupby
- # should not attempt to propagate the Series name since they are
- # inconsistent.
- return Series({
- 'count': 1,
- 'mean': 2,
- 'omissions': 3,
- }, name=df.iloc[0]['A'])
-
- metrics = self.df.groupby('A').apply(summarize)
- assert metrics.columns.name is None
- metrics = self.df.groupby('A').apply(summarize, 'metrics')
- assert metrics.columns.name == 'metrics'
- metrics = self.df.groupby('A').apply(summarize_random_name)
- assert metrics.columns.name is None
-
- def test_groupby_nonstring_columns(self):
- df = DataFrame([np.arange(10) for x in range(10)])
- grouped = df.groupby(0)
- result = grouped.mean()
- expected = df.groupby(df[0]).mean()
- assert_frame_equal(result, expected)
-
- def test_groupby_mixed_type_columns(self):
- # GH 13432, unorderable types in py3
- df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
- expected = DataFrame([[1, 2]], columns=['B', 0],
- index=Index([0], name='A'))
-
- result = df.groupby('A').first()
- tm.assert_frame_equal(result, expected)
-
- result = df.groupby('A').sum()
- tm.assert_frame_equal(result, expected)
-
- def test_cython_grouper_series_bug_noncontig(self):
- arr = np.empty((100, 100))
- arr.fill(np.nan)
- obj = Series(arr[:, 0], index=lrange(100))
- inds = np.tile(lrange(10), 10)
-
- result = obj.groupby(inds).agg(Series.median)
- assert result.isna().all()
-
- def test_series_grouper_noncontig_index(self):
- index = Index(tm.rands_array(10, 100))
-
- values = Series(np.random.randn(50), index=index[::2])
- labels = np.random.randint(0, 5, 50)
-
- # it works!
- grouped = values.groupby(labels)
-
- # accessing the index elements causes segfault
- f = lambda x: len(set(map(id, x.index)))
- grouped.agg(f)
-
- def test_convert_objects_leave_decimal_alone(self):
-
- from decimal import Decimal
-
- s = Series(lrange(5))
- labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
-
- def convert_fast(x):
- return Decimal(str(x.mean()))
-
- def convert_force_pure(x):
- # base will be length 0
- assert (len(x.base) > 0)
- return Decimal(str(x.mean()))
-
- grouped = s.groupby(labels)
-
- result = grouped.agg(convert_fast)
- assert result.dtype == np.object_
- assert isinstance(result[0], Decimal)
-
- result = grouped.agg(convert_force_pure)
- assert result.dtype == np.object_
- assert isinstance(result[0], Decimal)
-
- def test_fast_apply(self):
- # make sure that fast apply is correctly called
- # rather than raising any kind of error
- # otherwise the python path will be callsed
- # which slows things down
- N = 1000
- labels = np.random.randint(0, 2000, size=N)
- labels2 = np.random.randint(0, 3, size=N)
- df = DataFrame({'key': labels,
- 'key2': labels2,
- 'value1': np.random.randn(N),
- 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
-
- def f(g):
- return 1
-
- g = df.groupby(['key', 'key2'])
-
- grouper = g.grouper
-
- splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
- group_keys = grouper._get_group_keys()
-
- values, mutated = splitter.fast_apply(f, group_keys)
- assert not mutated
-
- def test_apply_with_mixed_dtype(self):
- # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
- df = DataFrame({'foo1': np.random.randn(6),
- 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
- result = df.apply(lambda x: x, axis=1)
- assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
-
- # GH 3610 incorrect dtype conversion with as_index=False
- df = DataFrame({"c1": [1, 2, 6, 6, 8]})
- df["c2"] = df.c1 / 2.0
- result1 = df.groupby("c2").mean().reset_index().c2
- result2 = df.groupby("c2", as_index=False).mean().c2
- assert_series_equal(result1, result2)
-
- def test_groupby_aggregation_mixed_dtype(self):
-
- # GH 6212
- expected = DataFrame({
- 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
- 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
- index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
- ('big', 'damp'),
- ('blue', 'dry'),
- ('red', 'red'), ('red', 'wet')],
- names=['by1', 'by2']))
-
- df = DataFrame({
- 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
- 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
- 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
- 12],
- 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
- np.nan, np.nan]
- })
-
- g = df.groupby(['by1', 'by2'])
- result = g[['v1', 'v2']].mean()
- assert_frame_equal(result, expected)
-
- def test_groupby_dtype_inference_empty(self):
- # GH 6733
- df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
- assert df['x'].dtype == np.float64
-
- result = df.groupby('x').first()
- exp_index = Index([], name='x', dtype=np.float64)
- expected = DataFrame({'range': Series(
- [], index=exp_index, dtype='int64')})
- assert_frame_equal(result, expected, by_blocks=True)
-
- def test_groupby_list_infer_array_like(self):
- result = self.df.groupby(list(self.df['A'])).mean()
- expected = self.df.groupby(self.df['A']).mean()
- assert_frame_equal(result, expected, check_names=False)
-
- pytest.raises(Exception, self.df.groupby, list(self.df['A'][:-1]))
-
- # pathological case of ambiguity
- df = DataFrame({'foo': [0, 1],
- 'bar': [3, 4],
- 'val': np.random.randn(2)})
-
- result = df.groupby(['foo', 'bar']).mean()
- expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
-
- def test_groupby_keys_same_size_as_index(self):
- # GH 11185
- freq = 's'
- index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
- periods=2, freq=freq)
- df = pd.DataFrame([['A', 10], ['B', 15]], columns=[
- 'metric', 'values'
- ], index=index)
- result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
- expected = df.set_index([df.index, 'metric'])
-
- assert_frame_equal(result, expected)
-
- def test_groupby_one_row(self):
- # GH 11741
- df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
- pytest.raises(KeyError, df1.groupby, 'Z')
- df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
- pytest.raises(KeyError, df2.groupby, 'Z')
-
- def test_groupby_nat_exclude(self):
- # GH 6992
- df = pd.DataFrame(
- {'values': np.random.randn(8),
- 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
- '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
- pd.Timestamp('2013-01-01')],
- 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
- grouped = df.groupby('dt')
-
- expected = [pd.Index([1, 7]), pd.Index([3, 5])]
- keys = sorted(grouped.groups.keys())
- assert len(keys) == 2
- for k, e in zip(keys, expected):
- # grouped.groups keys are np.datetime64 with system tz
- # not to be affected by tz, only compare values
- tm.assert_index_equal(grouped.groups[k], e)
-
- # confirm obj is not filtered
- tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
- assert grouped.ngroups == 2
-
- expected = {
- Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
- Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
- }
-
- for k in grouped.indices:
- tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
-
- tm.assert_frame_equal(
- grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
- tm.assert_frame_equal(
- grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+ data = read_csv(StringIO(data), index_col=0)
+
+ grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])
+
+ result = grouped.agg(np.mean)
+ expected = grouped.mean()
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_series_with_name(df):
+ result = df.groupby(df['A']).mean()
+ result2 = df.groupby(df['A'], as_index=False).mean()
+ assert result.index.name == 'A'
+ assert 'A' in result2
+
+ result = df.groupby([df['A'], df['B']]).mean()
+ result2 = df.groupby([df['A'], df['B']],
+ as_index=False).mean()
+ assert result.index.names == ('A', 'B')
+ assert 'A' in result2
+ assert 'B' in result2
+
+
+def test_seriesgroupby_name_attr(df):
+ # GH 6265
+ result = df.groupby('A')['C']
+ assert result.count().name == 'C'
+ assert result.mean().name == 'C'
+
+ testFunc = lambda x: np.sum(x) * 2
+ assert result.agg(testFunc).name == 'C'
+
+
+def test_consistency_name():
+ # GH 12363
+
+ df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+ 'foo', 'bar', 'foo', 'foo'],
+ 'B': ['one', 'one', 'two', 'two',
+ 'two', 'two', 'one', 'two'],
+ 'C': np.random.randn(8) + 1.0,
+ 'D': np.arange(8)})
+
+ expected = df.groupby(['A']).B.count()
+ result = df.B.groupby(df.A).count()
+ assert_series_equal(result, expected)
+
+
+def test_groupby_name_propagation(df):
+ # GH 6124
+ def summarize(df, name=None):
+ return Series({'count': 1, 'mean': 2, 'omissions': 3}, name=name)
+
+ def summarize_random_name(df):
+ # Provide a different name for each Series. In this case, groupby
+ # should not attempt to propagate the Series name since they are
+ # inconsistent.
+ return Series({
+ 'count': 1,
+ 'mean': 2,
+ 'omissions': 3,
+ }, name=df.iloc[0]['A'])
+
+ metrics = df.groupby('A').apply(summarize)
+ assert metrics.columns.name is None
+ metrics = df.groupby('A').apply(summarize, 'metrics')
+ assert metrics.columns.name == 'metrics'
+ metrics = df.groupby('A').apply(summarize_random_name)
+ assert metrics.columns.name is None
+
+
+def test_groupby_nonstring_columns():
+ df = DataFrame([np.arange(10) for x in range(10)])
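+ # integer column labels: groupby(0) must resolve 0 as a column key,
+ # matching an explicit groupby on that column's values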
+ grouped = df.groupby(0)
+ result = grouped.mean()
+ expected = df.groupby(df[0]).mean()
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_mixed_type_columns():
+ # GH 13432, unorderable types in py3
+ df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
+ expected = DataFrame([[1, 2]], columns=['B', 0],
+ index=Index([0], name='A'))
+
+ result = df.groupby('A').first()
+ tm.assert_frame_equal(result, expected)
+
+ result = df.groupby('A').sum()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_cython_grouper_series_bug_noncontig():
+ arr = np.empty((100, 100))
+ arr.fill(np.nan)
+ obj = Series(arr[:, 0], index=lrange(100))
+ inds = np.tile(lrange(10), 10)
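+ # arr[:, 0] is a strided, non-contiguous view; the Cython grouper must
+ # handle it, and with all-NaN input every group median is NaN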
+
+ result = obj.groupby(inds).agg(Series.median)
+ assert result.isna().all()
+
+
+def test_series_grouper_noncontig_index():
+ index = Index(tm.rands_array(10, 100))
+
+ values = Series(np.random.randn(50), index=index[::2])
+ labels = np.random.randint(0, 5, 50)
+
+ # it works!
+ grouped = values.groupby(labels)
+
+ # accessing the index elements used to cause a segfault
+ f = lambda x: len(set(map(id, x.index)))
+ grouped.agg(f)
+
+
+def test_convert_objects_leave_decimal_alone():
+
+ s = Series(lrange(5))
+ labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
+
+ def convert_fast(x):
+ return Decimal(str(x.mean()))
+
+ def convert_force_pure(x):
+ # x.base is non-empty, i.e. x is a view on a base array,
+ # which forces the slower pure-Python aggregation path
+ assert len(x.base) > 0
+ return Decimal(str(x.mean()))
+
+ grouped = s.groupby(labels)
+
+ result = grouped.agg(convert_fast)
+ assert result.dtype == np.object_
+ assert isinstance(result[0], Decimal)
+
+ result = grouped.agg(convert_force_pure)
+ assert result.dtype == np.object_
+ assert isinstance(result[0], Decimal)
+
+
+def test_groupby_dtype_inference_empty():
+ # GH 6733
+ df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
+ assert df['x'].dtype == np.float64
+
+ result = df.groupby('x').first()
+ exp_index = Index([], name='x', dtype=np.float64)
+ expected = DataFrame({'range': Series(
+ [], index=exp_index, dtype='int64')})
+ assert_frame_equal(result, expected, by_blocks=True)
+
+
+def test_groupby_list_infer_array_like(df):
+ result = df.groupby(list(df['A'])).mean()
+ expected = df.groupby(df['A']).mean()
+ assert_frame_equal(result, expected, check_names=False)
+
+ pytest.raises(Exception, df.groupby, list(df['A'][:-1]))
+
+ # pathological case of ambiguity
+ df = DataFrame({'foo': [0, 1],
+ 'bar': [3, 4],
+ 'val': np.random.randn(2)})
+
+ result = df.groupby(['foo', 'bar']).mean()
+ expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_keys_same_size_as_index():
+ # GH 11185
+ freq = 's'
+ index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
+ periods=2, freq=freq)
+ df = pd.DataFrame([['A', 10], ['B', 15]],
+ columns=['metric', 'values'], index=index)
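+ # a Grouper at the index's own freq puts each row in its own group,
+ # so grouping by [Grouper, 'metric'] is equivalent to set_index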
+ result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
+ expected = df.set_index([df.index, 'metric'])
+
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_one_row():
+ # GH 11741
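+ # a missing key must raise KeyError even for a single-row frame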
+ df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
+ pytest.raises(KeyError, df1.groupby, 'Z')
+ df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
+ pytest.raises(KeyError, df2.groupby, 'Z')
+
+
+def test_groupby_nat_exclude():
+ # GH 6992
+ df = pd.DataFrame(
+ {'values': np.random.randn(8),
+ 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
+ '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
+ pd.Timestamp('2013-01-01')],
+ 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
+ grouped = df.groupby('dt')
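+ # NaT keys are excluded: only rows 1/7 (2013-01-01) and 3/5
+ # (2013-02-01) form groups; the NaN/NaT rows are dropped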
+
+ expected = [pd.Index([1, 7]), pd.Index([3, 5])]
+ keys = sorted(grouped.groups.keys())
+ assert len(keys) == 2
+ for k, e in zip(keys, expected):
+ # grouped.groups keys are np.datetime64 in the system tz;
+ # to avoid tz effects, compare only the index values
+ tm.assert_index_equal(grouped.groups[k], e)
+
+ # confirm obj is not filtered
+ tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+ assert grouped.ngroups == 2
+
+ expected = {
+ Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
+ Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
+ }
+
+ for k in grouped.indices:
+ tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+ tm.assert_frame_equal(
+ grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+ tm.assert_frame_equal(
+ grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+ pytest.raises(KeyError, grouped.get_group, pd.NaT)
+
+ nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
+ 'nat': [pd.NaT, pd.NaT, pd.NaT]})
+ assert nan_df['nan'].dtype == 'float64'
+ assert nan_df['nat'].dtype == 'datetime64[ns]'
+
+ for key in ['nan', 'nat']:
+ grouped = nan_df.groupby(key)
+ assert grouped.groups == {}
+ assert grouped.ngroups == 0
+ assert grouped.indices == {}
+ pytest.raises(KeyError, grouped.get_group, np.nan)
pytest.raises(KeyError, grouped.get_group, pd.NaT)
- nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
- 'nat': [pd.NaT, pd.NaT, pd.NaT]})
- assert nan_df['nan'].dtype == 'float64'
- assert nan_df['nat'].dtype == 'datetime64[ns]'
-
- for key in ['nan', 'nat']:
- grouped = nan_df.groupby(key)
- assert grouped.groups == {}
- assert grouped.ngroups == 0
- assert grouped.indices == {}
- pytest.raises(KeyError, grouped.get_group, np.nan)
- pytest.raises(KeyError, grouped.get_group, pd.NaT)
-
- def test_sparse_friendly(self):
- sdf = self.df[['C', 'D']].to_sparse()
- with catch_warnings(record=True):
- panel = tm.makePanel()
- tm.add_nans(panel)
-
- def _check_work(gp):
- gp.mean()
- gp.agg(np.mean)
- dict(iter(gp))
-
- # it works!
- _check_work(sdf.groupby(lambda x: x // 2))
- _check_work(sdf['C'].groupby(lambda x: x // 2))
- _check_work(sdf.groupby(self.df['A']))
-
- # do this someday
- # _check_work(panel.groupby(lambda x: x.month, axis=1))
-
- def test_panel_groupby(self):
- with catch_warnings(record=True):
- self.panel = tm.makePanel()
- tm.add_nans(self.panel)
- grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
- axis='items')
- agged = grouped.mean()
- agged2 = grouped.agg(lambda x: x.mean('items'))
-
- tm.assert_panel_equal(agged, agged2)
-
- tm.assert_index_equal(agged.items, Index([0, 1]))
-
- grouped = self.panel.groupby(lambda x: x.month, axis='major')
- agged = grouped.mean()
-
- exp = Index(sorted(list(set(self.panel.major_axis.month))))
- tm.assert_index_equal(agged.major_axis, exp)
-
- grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
- axis='minor')
- agged = grouped.mean()
- tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
-
- def test_groupby_2d_malformed(self):
- d = DataFrame(index=lrange(2))
- d['group'] = ['g1', 'g2']
- d['zeros'] = [0, 0]
- d['ones'] = [1, 1]
- d['label'] = ['l1', 'l2']
- tmp = d.groupby(['group']).mean()
- res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
- tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
- tm.assert_numpy_array_equal(tmp.values, res_values)
-
- def test_int32_overflow(self):
- B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)
- ))
- A = np.arange(25000)
- df = DataFrame({'A': A,
- 'B': B,
- 'C': A,
- 'D': B,
- 'E': np.random.randn(25000)})
-
- left = df.groupby(['A', 'B', 'C', 'D']).sum()
- right = df.groupby(['D', 'C', 'B', 'A']).sum()
- assert len(left) == len(right)
-
- def test_groupby_sort_multi(self):
- df = DataFrame({'a': ['foo', 'bar', 'baz'],
- 'b': [3, 2, 1],
- 'c': [0, 1, 2],
- 'd': np.random.randn(3)})
-
- tups = lmap(tuple, df[['a', 'b', 'c']].values)
- tups = com._asarray_tuplesafe(tups)
- result = df.groupby(['a', 'b', 'c'], sort=True).sum()
- tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
- tups = lmap(tuple, df[['c', 'a', 'b']].values)
- tups = com._asarray_tuplesafe(tups)
- result = df.groupby(['c', 'a', 'b'], sort=True).sum()
- tm.assert_numpy_array_equal(result.index.values, tups)
+def test_sparse_friendly(df):
+ sdf = df[['C', 'D']].to_sparse()
+ with catch_warnings(record=True):
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+
+ def _check_work(gp):
+ gp.mean()
+ gp.agg(np.mean)
+ dict(iter(gp))
+
+ # it works!
+ _check_work(sdf.groupby(lambda x: x // 2))
+ _check_work(sdf['C'].groupby(lambda x: x // 2))
+ _check_work(sdf.groupby(df['A']))
+
+ # do this someday
+ # _check_work(panel.groupby(lambda x: x.month, axis=1))
+
+
+def test_panel_groupby():
+ with catch_warnings(record=True):
+ panel = tm.makePanel()
+ tm.add_nans(panel)
+ grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1},
+ axis='items')
+ agged = grouped.mean()
+ agged2 = grouped.agg(lambda x: x.mean('items'))
+
+ tm.assert_panel_equal(agged, agged2)
+
+ tm.assert_index_equal(agged.items, Index([0, 1]))
- tups = lmap(tuple, df[['b', 'c', 'a']].values)
+ grouped = panel.groupby(lambda x: x.month, axis='major')
+ agged = grouped.mean()
+
+ exp = Index(sorted(list(set(panel.major_axis.month))))
+ tm.assert_index_equal(agged.major_axis, exp)
+
+ grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
+ axis='minor')
+ agged = grouped.mean()
+ tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
+
+
+def test_groupby_2d_malformed():
+ d = DataFrame(index=lrange(2))
+ d['group'] = ['g1', 'g2']
+ d['zeros'] = [0, 0]
+ d['ones'] = [1, 1]
+ d['label'] = ['l1', 'l2']
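+ # the object-dtype 'label' column is dropped as a nuisance column,
+ # so only the numeric 'zeros'/'ones' columns survive the mean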
+ tmp = d.groupby(['group']).mean()
+ res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
+ tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
+ tm.assert_numpy_array_equal(tmp.values, res_values)
+
+
+def test_int32_overflow():
+ B = np.concatenate((np.arange(10000), np.arange(10000),
+ np.arange(5000)))
+ A = np.arange(25000)
+ df = DataFrame({'A': A,
+ 'B': B,
+ 'C': A,
+ 'D': B,
+ 'E': np.random.randn(25000)})
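+ # four 25000-valued keys make the intermediate compressed group index
+ # far exceed int32; both key orders must still yield the same groups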
+
+ left = df.groupby(['A', 'B', 'C', 'D']).sum()
+ right = df.groupby(['D', 'C', 'B', 'A']).sum()
+ assert len(left) == len(right)
+
+
+def test_groupby_sort_multi():
+ df = DataFrame({'a': ['foo', 'bar', 'baz'],
+ 'b': [3, 2, 1],
+ 'c': [0, 1, 2],
+ 'd': np.random.randn(3)})
+
+ tups = lmap(tuple, df[['a', 'b', 'c']].values)
+ tups = com._asarray_tuplesafe(tups)
+ result = df.groupby(['a', 'b', 'c'], sort=True).sum()
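+ # lexicographic sort of the (a, b, c) tuples orders
+ # 'bar' < 'baz' < 'foo', i.e. original rows 1, 2, 0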
+ tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
+
+ tups = lmap(tuple, df[['c', 'a', 'b']].values)
+ tups = com._asarray_tuplesafe(tups)
+ result = df.groupby(['c', 'a', 'b'], sort=True).sum()
+ tm.assert_numpy_array_equal(result.index.values, tups)
+
+ tups = lmap(tuple, df[['b', 'c', 'a']].values)
+ tups = com._asarray_tuplesafe(tups)
+ result = df.groupby(['b', 'c', 'a'], sort=True).sum()
+ tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
+
+ df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
+ 'b': [0, 0, 0, 1, 1, 1],
+ 'd': np.random.randn(6)})
+ grouped = df.groupby(['a', 'b'])['d']
+ result = grouped.sum()
+
+ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
+ tups = lmap(tuple, df[keys].values)
tups = com._asarray_tuplesafe(tups)
- result = df.groupby(['b', 'c', 'a'], sort=True).sum()
- tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
-
- df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
- 'b': [0, 0, 0, 1, 1, 1],
- 'd': np.random.randn(6)})
- grouped = df.groupby(['a', 'b'])['d']
- result = grouped.sum()
- _check_groupby(df, result, ['a', 'b'], 'd')
-
- def test_intercept_builtin_sum(self):
- s = Series([1., 2., np.nan, 3.])
- grouped = s.groupby([0, 1, 2, 2])
-
- result = grouped.agg(builtins.sum)
- result2 = grouped.apply(builtins.sum)
- expected = grouped.sum()
- assert_series_equal(result, expected)
- assert_series_equal(result2, expected)
-
- def test_rank_apply(self):
- lev1 = tm.rands_array(10, 100)
- lev2 = tm.rands_array(10, 130)
- lab1 = np.random.randint(0, 100, size=500)
- lab2 = np.random.randint(0, 130, size=500)
-
- df = DataFrame({'value': np.random.randn(500),
- 'key1': lev1.take(lab1),
- 'key2': lev2.take(lab2)})
-
- result = df.groupby(['key1', 'key2']).value.rank()
-
- expected = []
- for key, piece in df.groupby(['key1', 'key2']):
- expected.append(piece.value.rank())
- expected = concat(expected, axis=0)
- expected = expected.reindex(result.index)
- assert_series_equal(result, expected)
-
- result = df.groupby(['key1', 'key2']).value.rank(pct=True)
-
- expected = []
- for key, piece in df.groupby(['key1', 'key2']):
- expected.append(piece.value.rank(pct=True))
- expected = concat(expected, axis=0)
- expected = expected.reindex(result.index)
- assert_series_equal(result, expected)
-
- @pytest.mark.parametrize("grps", [
- ['qux'], ['qux', 'quux']])
- @pytest.mark.parametrize("vals", [
- [2, 2, 8, 2, 6],
- [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
- pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
- pd.Timestamp('2018-01-06')]])
- @pytest.mark.parametrize("ties_method,ascending,pct,exp", [
- ('average', True, False, [2., 2., 5., 2., 4.]),
- ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
- ('average', False, False, [4., 4., 1., 4., 2.]),
- ('average', False, True, [.8, .8, .2, .8, .4]),
- ('min', True, False, [1., 1., 5., 1., 4.]),
- ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
- ('min', False, False, [3., 3., 1., 3., 2.]),
- ('min', False, True, [.6, .6, .2, .6, .4]),
- ('max', True, False, [3., 3., 5., 3., 4.]),
- ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
- ('max', False, False, [5., 5., 1., 5., 2.]),
- ('max', False, True, [1., 1., .2, 1., .4]),
- ('first', True, False, [1., 2., 5., 3., 4.]),
- ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
- ('first', False, False, [3., 4., 1., 5., 2.]),
- ('first', False, True, [.6, .8, .2, 1., .4]),
- ('dense', True, False, [1., 1., 3., 1., 2.]),
- ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
- ('dense', False, False, [3., 3., 1., 3., 2.]),
- ('dense', False, True, [.6, .6, .2, .6, .4]),
- ])
- def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp):
- key = np.repeat(grps, len(vals))
- vals = vals * len(grps)
- df = DataFrame({'key': key, 'val': vals})
- result = df.groupby('key').rank(method=ties_method,
- ascending=ascending, pct=pct)
-
- exp_df = DataFrame(exp * len(grps), columns=['val'])
- assert_frame_equal(result, exp_df)
-
- @pytest.mark.parametrize("grps", [
- ['qux'], ['qux', 'quux']])
- @pytest.mark.parametrize("vals", [
- [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
- [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
- pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
- pd.Timestamp('2018-01-06'), np.nan, np.nan]
- ])
- @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
- ('average', True, 'keep', False,
- [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
- ('average', True, 'keep', True,
- [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
- ('average', False, 'keep', False,
- [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
- ('average', False, 'keep', True,
- [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
- ('min', True, 'keep', False,
- [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
- ('min', True, 'keep', True,
- [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
- ('min', False, 'keep', False,
- [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
- ('min', False, 'keep', True,
- [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
- ('max', True, 'keep', False,
- [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
- ('max', True, 'keep', True,
- [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
- ('max', False, 'keep', False,
- [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
- ('max', False, 'keep', True,
- [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
- ('first', True, 'keep', False,
- [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
- ('first', True, 'keep', True,
- [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
- ('first', False, 'keep', False,
- [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
- ('first', False, 'keep', True,
- [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
- ('dense', True, 'keep', False,
- [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
- ('dense', True, 'keep', True,
- [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
- ('dense', False, 'keep', False,
- [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
- ('dense', False, 'keep', True,
- [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
- ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
- ('average', True, 'no_na', True,
- [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
- ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
- ('average', False, 'no_na', True,
- [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
- ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
- ('min', True, 'no_na', True,
- [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
- ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
- ('min', False, 'no_na', True,
- [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
- ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
- ('max', True, 'no_na', True,
- [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
- ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
- ('max', False, 'no_na', True,
- [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
- ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
- ('first', True, 'no_na', True,
- [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
- ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
- ('first', False, 'no_na', True,
- [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
- ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
- ('dense', True, 'no_na', True,
- [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
- ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
- ('dense', False, 'no_na', True,
- [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
- ])
- def test_rank_args_missing(self, grps, vals, ties_method, ascending,
- na_option, pct, exp):
- key = np.repeat(grps, len(vals))
- vals = vals * len(grps)
- df = DataFrame({'key': key, 'val': vals})
- result = df.groupby('key').rank(method=ties_method,
- ascending=ascending,
- na_option=na_option, pct=pct)
-
- exp_df = DataFrame(exp * len(grps), columns=['val'])
- assert_frame_equal(result, exp_df)
-
- @pytest.mark.parametrize("pct,exp", [
- (False, [3., 3., 3., 3., 3.]),
- (True, [.6, .6, .6, .6, .6])])
- def test_rank_resets_each_group(self, pct, exp):
- df = DataFrame(
- {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
- 'val': [1] * 10}
- )
- result = df.groupby('key').rank(pct=pct)
- exp_df = DataFrame(exp * 2, columns=['val'])
- assert_frame_equal(result, exp_df)
-
- def test_rank_avg_even_vals(self):
- df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
- result = df.groupby('key').rank()
- exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
- assert_frame_equal(result, exp_df)
-
- @pytest.mark.parametrize("ties_method", [
- 'average', 'min', 'max', 'first', 'dense'])
- @pytest.mark.parametrize("ascending", [True, False])
- @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
- @pytest.mark.parametrize("pct", [True, False])
- @pytest.mark.parametrize("vals", [
- ['bar', 'bar', 'foo', 'bar', 'baz'],
- ['bar', np.nan, 'foo', np.nan, 'baz']
- ])
- def test_rank_object_raises(self, ties_method, ascending, na_option,
- pct, vals):
- df = DataFrame({'key': ['foo'] * 5, 'val': vals})
- with tm.assert_raises_regex(TypeError, "not callable"):
- df.groupby('key').rank(method=ties_method,
- ascending=ascending,
- na_option=na_option, pct=pct)
-
- @pytest.mark.parametrize("agg_func", ['any', 'all'])
- @pytest.mark.parametrize("skipna", [True, False])
- @pytest.mark.parametrize("vals", [
- ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
- [1, 2, 3], [1, 0, 0], [0, 0, 0],
- [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
- [True, True, True], [True, False, False], [False, False, False],
- [np.nan, np.nan, np.nan]
- ])
- def test_groupby_bool_aggs(self, agg_func, skipna, vals):
- df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})
-
- # Figure out expectation using Python builtin
- exp = getattr(compat.builtins, agg_func)(vals)
-
- # edge case for missing data with skipna and 'any'
- if skipna and all(isna(vals)) and agg_func == 'any':
- exp = False
-
- exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index(
- ['a', 'b'], name='key'))
- result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
- assert_frame_equal(result, exp_df)
-
- def test_dont_clobber_name_column(self):
- df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
- 'name': ['foo', 'bar', 'baz'] * 2})
-
- result = df.groupby('key').apply(lambda x: x)
- assert_frame_equal(result, df)
-
- def test_skip_group_keys(self):
- from pandas import concat
-
- tsf = tm.makeTimeDataFrame()
-
- grouped = tsf.groupby(lambda x: x.month, group_keys=False)
- result = grouped.apply(lambda x: x.sort_values(by='A')[:3])
-
- pieces = []
- for key, group in grouped:
- pieces.append(group.sort_values(by='A')[:3])
-
- expected = concat(pieces)
- assert_frame_equal(result, expected)
-
- grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
- result = grouped.apply(lambda x: x.sort_values()[:3])
-
- pieces = []
- for key, group in grouped:
- pieces.append(group.sort_values()[:3])
-
- expected = concat(pieces)
- assert_series_equal(result, expected)
-
- def test_no_nonsense_name(self):
- # GH #995
- s = self.frame['C'].copy()
- s.name = None
-
- result = s.groupby(self.frame['A']).agg(np.sum)
- assert result.name is None
-
- def test_multifunc_sum_bug(self):
- # GH #1065
- x = DataFrame(np.arange(9).reshape(3, 3))
- x['test'] = 0
- x['fl'] = [1.3, 1.5, 1.6]
-
- grouped = x.groupby('test')
- result = grouped.agg({'fl': 'sum', 2: 'size'})
- assert result['fl'].dtype == np.float64
-
- def test_handle_dict_return_value(self):
- def f(group):
- return {'max': group.max(), 'min': group.min()}
-
- def g(group):
- return Series({'max': group.max(), 'min': group.min()})
-
- result = self.df.groupby('A')['C'].apply(f)
- expected = self.df.groupby('A')['C'].apply(g)
-
- assert isinstance(result, Series)
- assert_series_equal(result, expected)
-
- def test_set_group_name(self):
- def f(group):
- assert group.name is not None
- return group
-
- def freduce(group):
- assert group.name is not None
- return group.sum()
-
- def foo(x):
- return freduce(x)
-
- def _check_all(grouped):
- # make sure all these work
- grouped.apply(f)
- grouped.aggregate(freduce)
- grouped.aggregate({'C': freduce, 'D': freduce})
- grouped.transform(f)
-
- grouped['C'].apply(f)
- grouped['C'].aggregate(freduce)
- grouped['C'].aggregate([freduce, foo])
- grouped['C'].transform(f)
+ expected = f(df.groupby(tups)[field])
+ for k, v in compat.iteritems(expected):
+ assert result[k] == v
- _check_all(self.df.groupby('A'))
- _check_all(self.df.groupby(['A', 'B']))
-
- def test_group_name_available_in_inference_pass(self):
- # gh-15062
- df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
-
- names = []
-
- def f(group):
- names.append(group.name)
- return group.copy()
-
- df.groupby('a', sort=False, group_keys=False).apply(f)
- # we expect 2 zeros because we call ``f`` once to see if a faster route
- # can be used.
- expected_names = [0, 0, 1, 2]
- assert names == expected_names
-
- def test_no_dummy_key_names(self):
- # see gh-1291
- result = self.df.groupby(self.df['A'].values).sum()
- assert result.index.name is None
-
- result = self.df.groupby([self.df['A'].values, self.df['B'].values
- ]).sum()
- assert result.index.names == (None, None)
-
- def test_groupby_sort_multiindex_series(self):
- # series multiindex groupby sort argument was not being passed through
- # _compress_group_index
- # GH 9444
- index = MultiIndex(levels=[[1, 2], [1, 2]],
- labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
- names=['a', 'b'])
- mseries = Series([0, 1, 2, 3, 4, 5], index=index)
- index = MultiIndex(levels=[[1, 2], [1, 2]],
- labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
- mseries_result = Series([0, 2, 4], index=index)
-
- result = mseries.groupby(level=['a', 'b'], sort=False).first()
- assert_series_equal(result, mseries_result)
- result = mseries.groupby(level=['a', 'b'], sort=True).first()
- assert_series_equal(result, mseries_result.sort_index())
-
- def test_groupby_reindex_inside_function(self):
-
- periods = 1000
- ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
- df = DataFrame({'high': np.arange(
- periods), 'low': np.arange(periods)}, index=ind)
-
- def agg_before(hour, func, fix=False):
- """
- Run an aggregate func on the subset of data.
- """
-
- def _func(data):
- d = data.loc[data.index.map(
- lambda x: x.hour < 11)].dropna()
- if fix:
- data[data.index[0]]
- if len(d) == 0:
- return None
- return func(d)
-
- return _func
-
- def afunc(data):
- d = data.select(lambda x: x.hour < 11).dropna()
- return np.max(d)
-
- grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
- closure_bad = grouped.agg({'high': agg_before(11, np.max)})
- closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
-
- assert_frame_equal(closure_bad, closure_good)
-
- def test_cython_median(self):
- df = DataFrame(np.random.randn(1000))
- df.values[::2] = np.nan
-
- labels = np.random.randint(0, 50, size=1000).astype(float)
- labels[::17] = np.nan
-
- result = df.groupby(labels).median()
- exp = df.groupby(labels).agg(nanops.nanmedian)
- assert_frame_equal(result, exp)
-
- df = DataFrame(np.random.randn(1000, 5))
- rs = df.groupby(labels).agg(np.median)
- xp = df.groupby(labels).median()
- assert_frame_equal(rs, xp)
-
- def test_median_empty_bins(self):
- df = pd.DataFrame(np.random.randint(0, 44, 500))
-
- grps = range(0, 55, 5)
- bins = pd.cut(df[0], grps)
-
- result = df.groupby(bins).median()
- expected = df.groupby(bins).agg(lambda x: x.median())
- assert_frame_equal(result, expected)
-
- @pytest.mark.parametrize("dtype", [
- 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
- @pytest.mark.parametrize("method,data", [
- ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
- ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
- ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
- ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
- ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
- 'args': [1]}),
- ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
- 'out_type': 'int64'})
- ])
- def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
- # GH9311, GH6620
- df = pd.DataFrame(
- [{'a': 1, 'b': 1},
- {'a': 1, 'b': 2},
- {'a': 2, 'b': 3},
- {'a': 2, 'b': 4}])
-
- df['b'] = df.b.astype(dtype)
-
- if 'args' not in data:
- data['args'] = []
-
- if 'out_type' in data:
- out_type = data['out_type']
- else:
- out_type = dtype
-
- exp = data['df']
- df_out = pd.DataFrame(exp)
-
- df_out['b'] = df_out.b.astype(out_type)
- df_out.set_index('a', inplace=True)
-
- grpd = df.groupby('a')
- t = getattr(grpd, method)(*data['args'])
- assert_frame_equal(t, df_out)
-
- def test_groupby_non_arithmetic_agg_intlike_precision(self):
- # GH9311, GH6620
- c = 24650000000000000
-
- inputs = ((Timestamp('2011-01-15 12:50:28.502376'),
- Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c))
-
- for i in inputs:
- df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}])
-
- grp_exp = {'first': {'expected': i[0]},
- 'last': {'expected': i[1]},
- 'min': {'expected': i[0]},
- 'max': {'expected': i[1]},
- 'nth': {'expected': i[1],
- 'args': [1]},
- 'count': {'expected': 2}}
-
- for method, data in compat.iteritems(grp_exp):
- if 'args' not in data:
- data['args'] = []
-
- grpd = df.groupby('a')
- res = getattr(grpd, method)(*data['args'])
- assert res.iloc[0].b == data['expected']
-
- def test_groupby_multiindex_missing_pair(self):
- # GH9049
- df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
- 'group2': ['c', 'c', 'd', 'c'],
- 'value': [1, 1, 1, 5]})
- df = df.set_index(['group1', 'group2'])
- df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
-
- res = df_grouped.agg('sum')
- idx = MultiIndex.from_tuples(
- [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
- exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
-
- tm.assert_frame_equal(res, exp)
-
- def test_groupby_multiindex_not_lexsorted(self):
- # GH 11640
-
- # define the lexsorted version
- lexsorted_mi = MultiIndex.from_tuples(
- [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
- lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
- assert lexsorted_df.columns.is_lexsorted()
-
- # define the non-lexsorted version
- not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
- data=[[1, 'b1', 'c1', 3],
- [1, 'b2', 'c2', 4]])
- not_lexsorted_df = not_lexsorted_df.pivot_table(
- index='a', columns=['b', 'c'], values='d')
- not_lexsorted_df = not_lexsorted_df.reset_index()
- assert not not_lexsorted_df.columns.is_lexsorted()
-
- # compare the results
- tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
-
- expected = lexsorted_df.groupby('a').mean()
- with tm.assert_produces_warning(PerformanceWarning):
- result = not_lexsorted_df.groupby('a').mean()
- tm.assert_frame_equal(expected, result)
-
- # a transforming function should work regardless of sort
- # GH 14776
- df = DataFrame({'x': ['a', 'a', 'b', 'a'],
- 'y': [1, 1, 2, 2],
- 'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
- assert not df.index.is_lexsorted()
-
- for level in [0, 1, [0, 1]]:
- for sort in [False, True]:
- result = df.groupby(level=level, sort=sort).apply(
- DataFrame.drop_duplicates)
- expected = df
- tm.assert_frame_equal(expected, result)
-
- result = df.sort_index().groupby(level=level, sort=sort).apply(
- DataFrame.drop_duplicates)
- expected = df.sort_index()
- tm.assert_frame_equal(expected, result)
-
- def test_gb_apply_list_of_unequal_len_arrays(self):
-
- # GH1738
- df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
- 'b', 'b', 'b'],
- 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
- 'd', 'd', 'e'],
- 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
- 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
- df = df.set_index(['group1', 'group2'])
- df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
-
- def noddy(value, weight):
- out = np.array(value * weight).repeat(3)
- return out
-
- # the kernel function returns arrays of unequal length
- # pandas sniffs the first one, sees it's an array and not
- # a list, and assumed the rest are of equal length
- # and so tries a vstack
-
- # don't die
- df_grouped.apply(lambda x: noddy(x.value, x.weight))
-
- def test_fill_constistency(self):
-
- # GH9221
- # pass thru keyword arguments to the generated wrapper
- # are set if the passed kw is None (only)
- df = DataFrame(index=pd.MultiIndex.from_product(
- [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
- columns=Index(
- ['1', '2'], name='id'))
- df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
- np.nan, 22, np.nan]
- df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
- np.nan, 44, np.nan]
-
- expected = df.groupby(level=0, axis=0).fillna(method='ffill')
- result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
- assert_frame_equal(result, expected)
-
- def test_index_label_overlaps_location(self):
- # checking we don't have any label/location confusion in the
- # the wake of GH5375
- df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
- g = df.groupby(list('ababb'))
- actual = g.filter(lambda x: len(x) > 2)
- expected = df.iloc[[1, 3, 4]]
- assert_frame_equal(actual, expected)
-
- ser = df[0]
- g = ser.groupby(list('ababb'))
- actual = g.filter(lambda x: len(x) > 2)
- expected = ser.take([1, 3, 4])
- assert_series_equal(actual, expected)
-
- # ... and again, with a generic Index of floats
- df.index = df.index.astype(float)
- g = df.groupby(list('ababb'))
- actual = g.filter(lambda x: len(x) > 2)
- expected = df.iloc[[1, 3, 4]]
- assert_frame_equal(actual, expected)
-
- ser = df[0]
- g = ser.groupby(list('ababb'))
- actual = g.filter(lambda x: len(x) > 2)
- expected = ser.take([1, 3, 4])
- assert_series_equal(actual, expected)
-
- def test_groupby_cumprod(self):
- # GH 4095
- df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
-
- actual = df.groupby('key')['value'].cumprod()
- expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
- expected.name = 'value'
- tm.assert_series_equal(actual, expected)
-
- df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
- actual = df.groupby('key')['value'].cumprod()
- # if overflows, groupby product casts to float
- # while numpy passes back invalid values
- df['value'] = df['value'].astype(float)
- expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
- expected.name = 'value'
- tm.assert_series_equal(actual, expected)
-
- def test_ops_general(self):
- ops = [('mean', np.mean),
- ('median', np.median),
- ('std', np.std),
- ('var', np.var),
- ('sum', np.sum),
- ('prod', np.prod),
- ('min', np.min),
- ('max', np.max),
- ('first', lambda x: x.iloc[0]),
- ('last', lambda x: x.iloc[-1]),
- ('count', np.size), ]
- try:
- from scipy.stats import sem
- except ImportError:
- pass
- else:
- ops.append(('sem', sem))
- df = DataFrame(np.random.randn(1000))
- labels = np.random.randint(0, 50, size=1000).astype(float)
-
- for op, targop in ops:
- result = getattr(df.groupby(labels), op)().astype(float)
- expected = df.groupby(labels).agg(targop)
- try:
- tm.assert_frame_equal(result, expected)
- except BaseException as exc:
- exc.args += ('operation: %s' % op, )
- raise
-
- def test_max_nan_bug(self):
- raw = """,Date,app,File
-2013-04-23,2013-04-23 00:00:00,,log080001.log
-2013-05-06,2013-05-06 00:00:00,,log.log
-2013-05-07,2013-05-07 00:00:00,OE,xlsx"""
-
- df = pd.read_csv(StringIO(raw), parse_dates=[0])
- gb = df.groupby('Date')
- r = gb[['File']].max()
- e = gb['File'].max().to_frame()
- tm.assert_frame_equal(r, e)
- assert not r['File'].isna().any()
-
- def test_nlargest(self):
- a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
- b = Series(list('a' * 5 + 'b' * 5))
- gb = a.groupby(b)
- r = gb.nlargest(3)
- e = Series([
- 7, 5, 3, 10, 9, 6
- ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]]))
- tm.assert_series_equal(r, e)
-
- a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
- gb = a.groupby(b)
- e = Series([
- 3, 2, 1, 3, 3, 2
- ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]]))
- assert_series_equal(gb.nlargest(3, keep='last'), e)
-
- def test_nsmallest(self):
- a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
- b = Series(list('a' * 5 + 'b' * 5))
- gb = a.groupby(b)
- r = gb.nsmallest(3)
- e = Series([
- 1, 2, 3, 0, 4, 6
- ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
- tm.assert_series_equal(r, e)
-
- a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
- gb = a.groupby(b)
- e = Series([
- 0, 1, 1, 0, 1, 2
- ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
- assert_series_equal(gb.nsmallest(3, keep='last'), e)
-
- def test_transform_doesnt_clobber_ints(self):
- # GH 7972
- n = 6
- x = np.arange(n)
- df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
- df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
-
- gb = df.groupby('a')
- result = gb.transform('mean')
-
- gb2 = df2.groupby('a')
- expected = gb2.transform('mean')
- tm.assert_frame_equal(result, expected)
-
- def test_groupby_apply_all_none(self):
- # Tests to make sure no errors if apply function returns all None
- # values. Issue 9684.
- test_df = DataFrame({'groups': [0, 0, 1, 1],
- 'random_vars': [8, 7, 4, 5]})
-
- def test_func(x):
- pass
-
- result = test_df.groupby('groups').apply(test_func)
- expected = DataFrame()
- tm.assert_frame_equal(result, expected)
-
- def test_groupby_apply_none_first(self):
- # GH 12824. Tests if apply returns None first.
- test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
- test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
-
- def test_func(x):
- if x.shape[0] < 2:
+ _check_groupby(df, result, ['a', 'b'], 'd')
+
+
+def test_dont_clobber_name_column():
+ df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
+ 'name': ['foo', 'bar', 'baz'] * 2})
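+ # a column literally named 'name' can collide with the Series.name
+ # attribute; an identity apply must still return it untouched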
+
+ result = df.groupby('key').apply(lambda x: x)
+ assert_frame_equal(result, df)
+
+
+def test_skip_group_keys():
+
+ tsf = tm.makeTimeDataFrame()
+
+ grouped = tsf.groupby(lambda x: x.month, group_keys=False)
+ result = grouped.apply(lambda x: x.sort_values(by='A')[:3])
+
+ pieces = []
+ for key, group in grouped:
+ pieces.append(group.sort_values(by='A')[:3])
+
+ expected = pd.concat(pieces)
+ assert_frame_equal(result, expected)
+
+ grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
+ result = grouped.apply(lambda x: x.sort_values()[:3])
+
+ pieces = []
+ for key, group in grouped:
+ pieces.append(group.sort_values()[:3])
+
+ expected = pd.concat(pieces)
+ assert_series_equal(result, expected)
+
+
+def test_no_nonsense_name(frame):
+ # GH #995
+ s = frame['C'].copy()
+ s.name = None
+
+ result = s.groupby(frame['A']).agg(np.sum)
+ assert result.name is None
+
+
+def test_multifunc_sum_bug():
+ # GH #1065
+ x = DataFrame(np.arange(9).reshape(3, 3))
+ x['test'] = 0
+ x['fl'] = [1.3, 1.5, 1.6]
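+ # agg dict with mixed str/int column keys ('fl' and 2); the float
+ # column must keep float64 rather than being cast during aggregation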
+
+ grouped = x.groupby('test')
+ result = grouped.agg({'fl': 'sum', 2: 'size'})
+ assert result['fl'].dtype == np.float64
+
+
+def test_handle_dict_return_value(df):
+ def f(group):
+ return {'max': group.max(), 'min': group.min()}
+
+ def g(group):
+ return Series({'max': group.max(), 'min': group.min()})
+
+ result = df.groupby('A')['C'].apply(f)
+ expected = df.groupby('A')['C'].apply(g)
+
+ assert isinstance(result, Series)
+ assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
+def test_set_group_name(df, grouper):
+ def f(group):
+ assert group.name is not None
+ return group
+
+ def freduce(group):
+ assert group.name is not None
+ return group.sum()
+
+ def foo(x):
+ return freduce(x)
+
+ grouped = df.groupby(grouper)
+
+ # make sure all these work
+ grouped.apply(f)
+ grouped.aggregate(freduce)
+ grouped.aggregate({'C': freduce, 'D': freduce})
+ grouped.transform(f)
+
+ grouped['C'].apply(f)
+ grouped['C'].aggregate(freduce)
+ grouped['C'].aggregate([freduce, foo])
+ grouped['C'].transform(f)
+
+
+def test_group_name_available_in_inference_pass():
+ # gh-15062
+ df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+ names = []
+
+ def f(group):
+ names.append(group.name)
+ return group.copy()
+
+ df.groupby('a', sort=False, group_keys=False).apply(f)
+ # we expect 2 zeros because we call ``f`` once to see if a faster route
+ # can be used.
+ expected_names = [0, 0, 1, 2]
+ assert names == expected_names
+
+
+def test_no_dummy_key_names(df):
+ # see gh-1291
+ result = df.groupby(df['A'].values).sum()
+ assert result.index.name is None
+
+ result = df.groupby([df['A'].values, df['B'].values]).sum()
+ assert result.index.names == (None, None)
+
+
+def test_groupby_sort_multiindex_series():
+ # series multiindex groupby sort argument was not being passed through
+ # _compress_group_index
+ # GH 9444
+ index = MultiIndex(levels=[[1, 2], [1, 2]],
+ labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
+ names=['a', 'b'])
+ mseries = Series([0, 1, 2, 3, 4, 5], index=index)
+ index = MultiIndex(levels=[[1, 2], [1, 2]],
+ labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b'])
+ mseries_result = Series([0, 2, 4], index=index)
+
+ result = mseries.groupby(level=['a', 'b'], sort=False).first()
+ assert_series_equal(result, mseries_result)
+ result = mseries.groupby(level=['a', 'b'], sort=True).first()
+ assert_series_equal(result, mseries_result.sort_index())
+
+
+def test_groupby_reindex_inside_function():
+
+ periods = 1000
+ ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
+ df = DataFrame({'high': np.arange(
+ periods), 'low': np.arange(periods)}, index=ind)
+
+ def agg_before(hour, func, fix=False):
+ """
+ Run an aggregate func on the subset of data.
+ """
+
+ def _func(data):
+ d = data.loc[data.index.map(
+ lambda x: x.hour < hour)].dropna()
+ if fix:
+ data[data.index[0]]
+ if len(d) == 0:
return None
- return x.iloc[[0, -1]]
-
- result1 = test_df1.groupby('groups').apply(test_func)
- result2 = test_df2.groupby('groups').apply(test_func)
- index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
- names=['groups', None])
- index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
- names=['groups', None])
- expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
- index=index1)
- expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
- index=index2)
- tm.assert_frame_equal(result1, expected1)
- tm.assert_frame_equal(result2, expected2)
-
- def test_groupby_preserves_sort(self):
- # Test to ensure that groupby always preserves sort order of original
- # object. Issue #8588 and #9651
-
- df = DataFrame(
- {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
- 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
- 'ints': [8, 7, 4, 5, 2, 9, 1, 1],
- 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
- 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})
-
- # Try sorting on different types and with different group types
- for sort_column in ['ints', 'floats', 'strings', ['ints', 'floats'],
- ['ints', 'strings']]:
- for group_column in ['int_groups', 'string_groups',
- ['int_groups', 'string_groups']]:
-
- df = df.sort_values(by=sort_column)
-
- g = df.groupby(group_column)
-
- def test_sort(x):
- assert_frame_equal(x, x.sort_values(by=sort_column))
-
- g.apply(test_sort)
-
- def test_numpy_compat(self):
- # see gh-12811
- df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
- g = df.groupby('A')
-
- msg = "numpy operations are not valid with groupby"
-
- for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'):
- tm.assert_raises_regex(UnsupportedFunctionCall, msg,
- getattr(g, func), 1, 2, 3)
- tm.assert_raises_regex(UnsupportedFunctionCall, msg,
- getattr(g, func), foo=1)
-
- def test_group_shift_with_null_key(self):
- # This test is designed to replicate the segfault in issue #13813.
- n_rows = 1200
-
- # Generate a moderately large dataframe with occasional missing
- # values in column `B`, and then group by [`A`, `B`]. This should
- # force `-1` in `labels` array of `g.grouper.group_info` exactly
- # at those places, where the group-by key is partially missing.
- df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
- for i in range(n_rows)], dtype=float,
- columns=["A", "B", "Z"], index=None)
- g = df.groupby(["A", "B"])
-
- expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
- else np.nan)
- for i in range(n_rows)], dtype=float,
- columns=["Z"], index=None)
- result = g.shift(-1)
-
- assert_frame_equal(result, expected)
-
- def test_pivot_table_values_key_error(self):
- # This test is designed to replicate the error in issue #14938
- df = pd.DataFrame({'eventDate':
- pd.date_range(pd.datetime.today(),
- periods=20, freq='M').tolist(),
- 'thename': range(0, 20)})
-
- df['year'] = df.set_index('eventDate').index.year
- df['month'] = df.set_index('eventDate').index.month
-
- with pytest.raises(KeyError):
- df.reset_index().pivot_table(index='year', columns='month',
- values='badname', aggfunc='count')
-
- def test_cummin_cummax(self):
- # GH 15048
- num_types = [np.int32, np.int64, np.float32, np.float64]
- num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
- np.finfo(np.float32).min, np.finfo(np.float64).min]
- num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
- np.finfo(np.float32).max, np.finfo(np.float64).max]
- base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
- 'B': [3, 4, 3, 2, 2, 3, 2, 1]})
- expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
- expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
-
- for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
- df = base_df.astype(dtype)
-
- # cummin
- expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
- result = df.groupby('A').cummin()
- tm.assert_frame_equal(result, expected)
- result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
- tm.assert_frame_equal(result, expected)
-
- # Test cummin w/ min value for dtype
- df.loc[[2, 6], 'B'] = min_val
- expected.loc[[2, 3, 6, 7], 'B'] = min_val
- result = df.groupby('A').cummin()
- tm.assert_frame_equal(result, expected)
- expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
- tm.assert_frame_equal(result, expected)
-
- # cummax
- expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
- result = df.groupby('A').cummax()
- tm.assert_frame_equal(result, expected)
- result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
- tm.assert_frame_equal(result, expected)
-
- # Test cummax w/ max value for dtype
- df.loc[[2, 6], 'B'] = max_val
- expected.loc[[2, 3, 6, 7], 'B'] = max_val
- result = df.groupby('A').cummax()
- tm.assert_frame_equal(result, expected)
- expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
- tm.assert_frame_equal(result, expected)
-
- # Test nan in some values
- base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
- expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
- np.nan, 3, np.nan, 1]})
- result = base_df.groupby('A').cummin()
- tm.assert_frame_equal(result, expected)
- expected = (base_df.groupby('A')
- .B
- .apply(lambda x: x.cummin())
- .to_frame())
- tm.assert_frame_equal(result, expected)
-
- expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
- np.nan, 3, np.nan, 3]})
- result = base_df.groupby('A').cummax()
- tm.assert_frame_equal(result, expected)
- expected = (base_df.groupby('A')
- .B
- .apply(lambda x: x.cummax())
- .to_frame())
- tm.assert_frame_equal(result, expected)
-
- # Test nan in entire column
- base_df['B'] = np.nan
- expected = pd.DataFrame({'B': [np.nan] * 8})
- result = base_df.groupby('A').cummin()
- tm.assert_frame_equal(expected, result)
- result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
- tm.assert_frame_equal(expected, result)
- result = base_df.groupby('A').cummax()
- tm.assert_frame_equal(expected, result)
- result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
- tm.assert_frame_equal(expected, result)
-
- # GH 15561
- df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
- expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
- for method in ['cummax', 'cummin']:
- result = getattr(df.groupby('a')['b'], method)()
- tm.assert_series_equal(expected, result)
-
- # GH 15635
- df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
- result = df.groupby('a').b.cummax()
- expected = pd.Series([2, 1, 2], name='b')
- tm.assert_series_equal(result, expected)
-
- df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
- result = df.groupby('a').b.cummin()
- expected = pd.Series([1, 2, 1], name='b')
- tm.assert_series_equal(result, expected)
-
- @pytest.mark.parametrize('in_vals, out_vals', [
-
- # Basics: strictly increasing (T), strictly decreasing (F),
- # abs val increasing (F), non-strictly increasing (T)
- ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
- [True, False, False, True]),
-
- # Test with inf vals
- ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
- [True, False, True, False]),
-
- # Test with nan vals; should always be False
- ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
- [False, False, False, False]),
- ])
- def test_is_monotonic_increasing(self, in_vals, out_vals):
- # GH 17015
- source_dict = {
- 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
- 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
- 'C': in_vals}
- df = pd.DataFrame(source_dict)
- result = df.groupby('B').C.is_monotonic_increasing
- index = Index(list('abcd'), name='B')
- expected = pd.Series(index=index, data=out_vals, name='C')
- tm.assert_series_equal(result, expected)
-
- # Also check result equal to manually taking x.is_monotonic_increasing.
- expected = (
- df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing))
- tm.assert_series_equal(result, expected)
-
- @pytest.mark.parametrize('in_vals, out_vals', [
- # Basics: strictly decreasing (T), strictly increasing (F),
- # abs val decreasing (F), non-strictly increasing (T)
- ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
- [True, False, False, True]),
-
- # Test with inf vals
- ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
- [True, True, False, True]),
-
- # Test with nan vals; should always be False
- ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
- [False, False, False, False]),
- ])
- def test_is_monotonic_decreasing(self, in_vals, out_vals):
- # GH 17015
- source_dict = {
- 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
- 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
- 'C': in_vals}
-
- df = pd.DataFrame(source_dict)
- result = df.groupby('B').C.is_monotonic_decreasing
- index = Index(list('abcd'), name='B')
- expected = pd.Series(index=index, data=out_vals, name='C')
- tm.assert_series_equal(result, expected)
-
- def test_apply_numeric_coercion_when_datetime(self):
- # In the past, group-by/apply operations have been over-eager
- # in converting dtypes to numeric, in the presence of datetime
- # columns. Various GH issues were filed, the reproductions
- # for which are here.
-
- # GH 15670
- df = pd.DataFrame({'Number': [1, 2],
- 'Date': ["2017-03-02"] * 2,
- 'Str': ["foo", "inf"]})
- expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
- df.Date = pd.to_datetime(df.Date)
- result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
- tm.assert_series_equal(result['Str'], expected['Str'])
-
- # GH 15421
- df = pd.DataFrame({'A': [10, 20, 30],
- 'B': ['foo', '3', '4'],
- 'T': [pd.Timestamp("12:31:22")] * 3})
-
- def get_B(g):
- return g.iloc[0][['B']]
- result = df.groupby('A').apply(get_B)['B']
- expected = df.B
- expected.index = df.A
- tm.assert_series_equal(result, expected)
-
- # GH 14423
- def predictions(tool):
- out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
- if 'step1' in list(tool.State):
- out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
- if 'step2' in list(tool.State):
- out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
- out['useTime'] = str(
- tool[tool.State == 'step2'].oTime.values[0])
- return out
- df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
- 'State': ['step1', 'step2', 'step1', 'step2'],
- 'oTime': ['', '2016-09-19 05:24:33',
- '', '2016-09-19 23:59:04'],
- 'Machine': ['23', '36L', '36R', '36R']})
- df2 = df1.copy()
- df2.oTime = pd.to_datetime(df2.oTime)
- expected = df1.groupby('Key').apply(predictions).p1
- result = df2.groupby('Key').apply(predictions).p1
- tm.assert_series_equal(expected, result)
-
- def test_pipe(self):
- # Test the pipe method of DataFrameGroupBy.
- # Issue #17871
-
- random_state = np.random.RandomState(1234567890)
-
- df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
- 'foo', 'bar', 'foo', 'foo'],
- 'B': random_state.randn(8),
- 'C': random_state.randn(8)})
-
- def f(dfgb):
- return dfgb.B.max() - dfgb.C.min().min()
-
- def square(srs):
- return srs ** 2
-
- # Note that the transformations are
- # GroupBy -> Series
- # Series -> Series
- # This then chains the GroupBy.pipe and the
- # NDFrame.pipe methods
- result = df.groupby('A').pipe(f).pipe(square)
-
- index = Index([u'bar', u'foo'], dtype='object', name=u'A')
- expected = pd.Series([8.99110003361, 8.17516964785], name='B',
- index=index)
-
- assert_series_equal(expected, result)
-
- def test_pipe_args(self):
- # Test passing args to the pipe method of DataFrameGroupBy.
- # Issue #17871
-
- df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
- 'x': [1.0, 2.0, 3.0, 2.0, 5.0],
- 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
-
- def f(dfgb, arg1):
- return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
- .groupby(dfgb.grouper))
-
- def g(dfgb, arg2):
- return dfgb.sum() / dfgb.sum().sum() + arg2
-
- def h(df, arg3):
- return df.x + df.y - arg3
-
- result = (df
- .groupby('group')
- .pipe(f, 0)
- .pipe(g, 10)
- .pipe(h, 100))
-
- # Assert the results here
- index = pd.Index(['A', 'B', 'C'], name='group')
- expected = pd.Series([-79.5160891089, -78.4839108911, -80],
- index=index)
-
- assert_series_equal(expected, result)
-
- # test SeriesGroupby.pipe
- ser = pd.Series([1, 1, 2, 2, 3, 3])
- result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
-
- expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
-
- assert_series_equal(result, expected)
-
- def test_empty_dataframe_groupby(self):
- # GH8093
- df = DataFrame(columns=['A', 'B', 'C'])
-
- result = df.groupby('A').sum()
- expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
- expected.index.name = 'A'
-
- assert_frame_equal(result, expected)
-
- def test_tuple_warns(self):
- # https://github.com/pandas-dev/pandas/issues/18314
- df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
- 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
- with tm.assert_produces_warning(FutureWarning) as w:
- df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()
-
- assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+ return func(d)
+
+ return _func
+
+ def afunc(data):
+ d = data.select(lambda x: x.hour < 11).dropna()
+ return np.max(d)
+
+ grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
+ closure_bad = grouped.agg({'high': agg_before(11, np.max)})
+ closure_good = grouped.agg({'high': agg_before(11, np.max, True)})
+
+ assert_frame_equal(closure_bad, closure_good)
+
+
+def test_groupby_multiindex_missing_pair():
+ # GH9049
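+ # only the observed (group1, group2) pairs should appear in the
+ # aggregated result; the unobserved pair ('b', 'd') must not be
+ # filled in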
+ df = DataFrame({'group1': ['a', 'a', 'a', 'b'],
+ 'group2': ['c', 'c', 'd', 'c'],
+ 'value': [1, 1, 1, 5]})
+ df = df.set_index(['group1', 'group2'])
+ df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
+
+ res = df_grouped.agg('sum')
+ idx = MultiIndex.from_tuples(
+ [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
+ exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
+
+ tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_multiindex_not_lexsorted():
+ # GH 11640
+
+ # define the lexsorted version
+ lexsorted_mi = MultiIndex.from_tuples(
+ [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
+ lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
+ assert lexsorted_df.columns.is_lexsorted()
+
+ # define the non-lexsorted version
+ not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
+ data=[[1, 'b1', 'c1', 3],
+ [1, 'b2', 'c2', 4]])
+ not_lexsorted_df = not_lexsorted_df.pivot_table(
+ index='a', columns=['b', 'c'], values='d')
+ not_lexsorted_df = not_lexsorted_df.reset_index()
+ assert not not_lexsorted_df.columns.is_lexsorted()
+
+ # compare the results
+ tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
+
+ expected = lexsorted_df.groupby('a').mean()
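+ # the result is still computed correctly; pandas merely warns that
+ # operating on a non-lexsorted MultiIndex can be slow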
+ with tm.assert_produces_warning(PerformanceWarning):
+ result = not_lexsorted_df.groupby('a').mean()
+ tm.assert_frame_equal(expected, result)
+
+ # a transforming function should work regardless of sort
+ # GH 14776
+ df = DataFrame({'x': ['a', 'a', 'b', 'a'],
+ 'y': [1, 1, 2, 2],
+ 'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
+ assert not df.index.is_lexsorted()
+
+ for level in [0, 1, [0, 1]]:
+ for sort in [False, True]:
+ result = df.groupby(level=level, sort=sort).apply(
+ DataFrame.drop_duplicates)
+ expected = df
+ tm.assert_frame_equal(expected, result)
+
+ result = df.sort_index().groupby(level=level, sort=sort).apply(
+ DataFrame.drop_duplicates)
+ expected = df.sort_index()
+ tm.assert_frame_equal(expected, result)
+
+
+def test_index_label_overlaps_location():
+ # checking we don't have any label/location confusion in
+ # the wake of GH5375
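+ # index labels [2, 0, 2, 1, 1] deliberately collide with positional
+ # values, so any label/position mix-up in filter would select the
+ # wrong rows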
+ df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
+ g = df.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = df.iloc[[1, 3, 4]]
+ assert_frame_equal(actual, expected)
+
+ ser = df[0]
+ g = ser.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = ser.take([1, 3, 4])
+ assert_series_equal(actual, expected)
+
+ # ... and again, with a generic Index of floats
+ df.index = df.index.astype(float)
+ g = df.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = df.iloc[[1, 3, 4]]
+ assert_frame_equal(actual, expected)
+
+ ser = df[0]
+ g = ser.groupby(list('ababb'))
+ actual = g.filter(lambda x: len(x) > 2)
+ expected = ser.take([1, 3, 4])
+ assert_series_equal(actual, expected)
+
+
+def test_transform_doesnt_clobber_ints():
+ # GH 7972
+ n = 6
+ x = np.arange(n)
+ df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
+ df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
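+ # df2 is the all-float analogue of df: grouping by the integer key
+ # must produce exactly the same transformed means as grouping by its
+ # float equivalent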
+
+ gb = df.groupby('a')
+ result = gb.transform('mean')
+
+ gb2 = df2.groupby('a')
+ expected = gb2.transform('mean')
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
+ ['ints', 'floats'],
+ ['ints', 'strings']])
+@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
+ ['int_groups', 'string_groups']])
+def test_groupby_preserves_sort(sort_column, group_column):
+ # Test to ensure that groupby always preserves the sort order of the
+ # original object. Issues #8588 and #9651
+
+ df = DataFrame(
+ {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
+ 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
+ 'ints': [8, 7, 4, 5, 2, 9, 1, 1],
+ 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
+ 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})
+
+ # Try sorting on different types and with different group types
+
+ df = df.sort_values(by=sort_column)
+ g = df.groupby(group_column)
+
+ def test_sort(x):
+ assert_frame_equal(x, x.sort_values(by=sort_column))
+ g.apply(test_sort)
+
+
+def test_group_shift_with_null_key():
+ # This test is designed to replicate the segfault in issue #13813.
+ n_rows = 1200
+
+ # Generate a moderately large dataframe with occasional missing
+ # values in column `B`, and then group by [`A`, `B`]. This should
+ # force `-1` in the `labels` array of `g.grouper.group_info` exactly
+ # at those places where the group-by key is partially missing.
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
+ for i in range(n_rows)], dtype=float,
+ columns=["A", "B", "Z"], index=None)
+ g = df.groupby(["A", "B"])
+
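+ # within each (A, B) group consecutive members sit 12 rows apart, so
+ # shift(-1) yields row i + 12; rows whose key contains NaN drop out
+ # of the grouping and therefore stay NaN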
+ expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
+ else np.nan)
+ for i in range(n_rows)], dtype=float,
+ columns=["Z"], index=None)
+ result = g.shift(-1)
+
+ assert_frame_equal(result, expected)
+
- with tm.assert_produces_warning(None):
- df.groupby(('a', 'b')).c.mean()
+
+def test_pivot_table_values_key_error():
+ # This test is designed to replicate the error in issue #14938
+ df = pd.DataFrame({'eventDate':
+ pd.date_range(pd.datetime.today(),
+ periods=20, freq='M').tolist(),
+ 'thename': range(0, 20)})
- def test_tuple_warns_unhashable(self):
- # https://github.com/pandas-dev/pandas/issues/18314
- business_dates = date_range(start='4/1/2014', end='6/30/2014',
- freq='B')
- df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+
+ df['year'] = df.set_index('eventDate').index.year
+ df['month'] = df.set_index('eventDate').index.month
+
+ with pytest.raises(KeyError):
+ df.reset_index().pivot_table(index='year', columns='month',
+ values='badname', aggfunc='count')
- with tm.assert_produces_warning(FutureWarning) as w:
- df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
- assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+
+
+def test_empty_dataframe_groupby():
+ # GH8093
+ df = DataFrame(columns=['A', 'B', 'C'])
- def test_tuple_correct_keyerror(self):
- # https://github.com/pandas-dev/pandas/issues/18798
- df = pd.DataFrame(1, index=range(3),
- columns=pd.MultiIndex.from_product([[1, 2],
- [3, 4]]))
- with tm.assert_raises_regex(KeyError, "(7, 8)"):
- df.groupby((7, 8)).mean()
+
+ result = df.groupby('A').sum()
+ expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
+ expected.index.name = 'A'
+
+ assert_frame_equal(result, expected)
-def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
- tups = lmap(tuple, df[keys].values)
- tups = com._asarray_tuplesafe(tups)
- expected = f(df.groupby(tups)[field])
- for k, v in compat.iteritems(expected):
- assert (result[k] == v)
+
+
+def test_tuple_warns():
+ # https://github.com/pandas-dev/pandas/issues/18314
+ df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
+ 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()
+
+ assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+
+ with tm.assert_produces_warning(None):
+ df.groupby(('a', 'b')).c.mean()
+
+
+def test_tuple_warns_unhashable():
+ # https://github.com/pandas-dev/pandas/issues/18314
+ business_dates = date_range(start='4/1/2014', end='6/30/2014',
+ freq='B')
+ df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+
+ with tm.assert_produces_warning(FutureWarning) as w:
+ df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
+
+ assert "Interpreting tuple 'by' as a list" in str(w[0].message)
+
+
+def test_tuple_correct_keyerror():
+ # https://github.com/pandas-dev/pandas/issues/18798
+ df = pd.DataFrame(1, index=range(3),
+ columns=pd.MultiIndex.from_product([[1, 2],
+ [3, 4]]))
+ with tm.assert_raises_regex(KeyError, "(7, 8)"):
+ df.groupby((7, 8)).mean()
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 57becd342d370..743237f5b386c 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -9,6 +9,7 @@
Index, MultiIndex, DataFrame, Series, CategoricalIndex)
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
assert_series_equal, assert_almost_equal)
+from pandas.core.groupby.groupby import Grouping
from pandas.compat import lrange, long
from pandas import compat
@@ -16,13 +17,12 @@
import pandas.util.testing as tm
import pandas as pd
-from .common import MixIn
# selection
# --------------------------------
-class TestSelection(MixIn):
+class TestSelection(object):
def test_select_bad_cols(self):
df = DataFrame([[1, 2]], columns=['A', 'B'])
@@ -48,14 +48,14 @@ def test_groupby_duplicated_column_errormsg(self):
assert c.columns.nlevels == 1
assert c.columns.size == 3
- def test_column_select_via_attr(self):
- result = self.df.groupby('A').C.sum()
- expected = self.df.groupby('A')['C'].sum()
+ def test_column_select_via_attr(self, df):
+ result = df.groupby('A').C.sum()
+ expected = df.groupby('A')['C'].sum()
assert_series_equal(result, expected)
- self.df['mean'] = 1.5
- result = self.df.groupby('A').mean()
- expected = self.df.groupby('A').agg(np.mean)
+ df['mean'] = 1.5
+ result = df.groupby('A').mean()
+ expected = df.groupby('A').agg(np.mean)
assert_frame_equal(result, expected)
def test_getitem_list_of_columns(self):
@@ -96,7 +96,7 @@ def test_getitem_numeric_column_names(self):
# grouping
# --------------------------------
-class TestGrouping(MixIn):
+class TestGrouping(object):
def test_grouper_index_types(self):
# related GH5375
@@ -291,17 +291,17 @@ def test_grouper_getting_correct_binner(self):
names=['one', 'two']))
assert_frame_equal(result, expected)
- def test_grouper_iter(self):
- assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo']
+ def test_grouper_iter(self, df):
+ assert sorted(df.groupby('A').grouper) == ['bar', 'foo']
- def test_empty_groups(self):
+ def test_empty_groups(self, df):
# see gh-1048
- pytest.raises(ValueError, self.df.groupby, [])
+ pytest.raises(ValueError, df.groupby, [])
- def test_groupby_grouper(self):
- grouped = self.df.groupby('A')
+ def test_groupby_grouper(self, df):
+ grouped = df.groupby('A')
- result = self.df.groupby(grouped.grouper).mean()
+ result = df.groupby(grouped.grouper).mean()
expected = grouped.mean()
tm.assert_frame_equal(result, expected)
@@ -339,10 +339,9 @@ def test_groupby_grouper_f_sanity_checked(self):
pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6])
- def test_grouping_error_on_multidim_input(self):
- from pandas.core.groupby.groupby import Grouping
+ def test_grouping_error_on_multidim_input(self, df):
pytest.raises(ValueError,
- Grouping, self.df.index, self.df[['A', 'A']])
+ Grouping, df.index, df[['A', 'A']])
def test_multiindex_passthru(self):
@@ -354,26 +353,25 @@ def test_multiindex_passthru(self):
result = df.groupby(axis=1, level=[0, 1]).first()
assert_frame_equal(result, df)
- def test_multiindex_negative_level(self):
+ def test_multiindex_negative_level(self, mframe):
# GH 13901
- result = self.mframe.groupby(level=-1).sum()
- expected = self.mframe.groupby(level='second').sum()
+ result = mframe.groupby(level=-1).sum()
+ expected = mframe.groupby(level='second').sum()
assert_frame_equal(result, expected)
- result = self.mframe.groupby(level=-2).sum()
- expected = self.mframe.groupby(level='first').sum()
+ result = mframe.groupby(level=-2).sum()
+ expected = mframe.groupby(level='first').sum()
assert_frame_equal(result, expected)
- result = self.mframe.groupby(level=[-2, -1]).sum()
- expected = self.mframe
+ result = mframe.groupby(level=[-2, -1]).sum()
+ expected = mframe
assert_frame_equal(result, expected)
- result = self.mframe.groupby(level=[-1, 'first']).sum()
- expected = self.mframe.groupby(level=['second', 'first']).sum()
+ result = mframe.groupby(level=[-1, 'first']).sum()
+ expected = mframe.groupby(level=['second', 'first']).sum()
assert_frame_equal(result, expected)
- def test_multifunc_select_col_integer_cols(self):
- df = self.df
+ def test_multifunc_select_col_integer_cols(self, df):
df.columns = np.arange(len(df.columns))
# it works!
@@ -428,9 +426,9 @@ def test_groupby_multiindex_tuple(self):
tm.assert_dict_equal(expected, result)
@pytest.mark.parametrize('sort', [True, False])
- def test_groupby_level(self, sort):
+ def test_groupby_level(self, sort, mframe, df):
# GH 17537
- frame = self.mframe
+ frame = mframe
deleveled = frame.reset_index()
result0 = frame.groupby(level=0, sort=sort).sum()
@@ -464,7 +462,7 @@ def test_groupby_level(self, sort):
assert_frame_equal(result1, expected1.T)
# raise exception for non-MultiIndex
- pytest.raises(ValueError, self.df.groupby, level=1)
+ pytest.raises(ValueError, df.groupby, level=1)
def test_groupby_level_index_names(self):
# GH4014 this used to raise ValueError since 'exp'>1 (in py2)
@@ -496,9 +494,9 @@ def test_groupby_level_with_nas(self, sort):
expected = Series([6., 18.], index=[0.0, 1.0])
assert_series_equal(result, expected)
- def test_groupby_args(self):
+ def test_groupby_args(self, mframe):
# PR8618 and issue 8015
- frame = self.mframe
+ frame = mframe
def j():
frame.groupby()
@@ -516,14 +514,14 @@ def k():
[True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
[False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
])
- def test_level_preserve_order(self, sort, labels):
+ def test_level_preserve_order(self, sort, labels, mframe):
# GH 17537
- grouped = self.mframe.groupby(level=0, sort=sort)
+ grouped = mframe.groupby(level=0, sort=sort)
exp_labels = np.array(labels, np.intp)
assert_almost_equal(grouped.grouper.labels[0], exp_labels)
- def test_grouping_labels(self):
- grouped = self.mframe.groupby(self.mframe.index.get_level_values(0))
+ def test_grouping_labels(self, mframe):
+ grouped = mframe.groupby(mframe.index.get_level_values(0))
exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
assert_almost_equal(grouped.grouper.labels[0], exp_labels)
@@ -531,7 +529,7 @@ def test_grouping_labels(self):
# get_group
# --------------------------------
-class TestGetGroup(MixIn):
+class TestGetGroup(object):
def test_get_group(self):
with catch_warnings(record=True):
@@ -638,29 +636,28 @@ def test_gb_key_len_equal_axis_len(self):
# groups & iteration
# --------------------------------
-class TestIteration(MixIn):
+class TestIteration(object):
- def test_groups(self):
- grouped = self.df.groupby(['A'])
+ def test_groups(self, df):
+ grouped = df.groupby(['A'])
groups = grouped.groups
assert groups is grouped.groups # caching works
for k, v in compat.iteritems(grouped.groups):
- assert (self.df.loc[v]['A'] == k).all()
+ assert (df.loc[v]['A'] == k).all()
- grouped = self.df.groupby(['A', 'B'])
+ grouped = df.groupby(['A', 'B'])
groups = grouped.groups
assert groups is grouped.groups # caching works
for k, v in compat.iteritems(grouped.groups):
- assert (self.df.loc[v]['A'] == k[0]).all()
- assert (self.df.loc[v]['B'] == k[1]).all()
+ assert (df.loc[v]['A'] == k[0]).all()
+ assert (df.loc[v]['B'] == k[1]).all()
- def test_grouping_is_iterable(self):
+ def test_grouping_is_iterable(self, tsframe):
# this code path isn't used anywhere else
# not sure it's useful
- grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year
- ])
+ grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
# test it works
for g in grouped.grouper.groupings[0]:
@@ -682,7 +679,7 @@ def test_multi_iter(self):
assert e2 == two
assert_series_equal(three, e3)
- def test_multi_iter_frame(self):
+ def test_multi_iter_frame(self, three_group):
k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
k2 = np.array(['1', '2', '1', '2', '1', '2'])
df = DataFrame({'v1': np.random.randn(6),
@@ -715,7 +712,7 @@ def test_multi_iter_frame(self):
assert len(groups) == 2
# axis = 1
- three_levels = self.three_group.groupby(['A', 'B', 'C']).mean()
+ three_levels = three_group.groupby(['A', 'B', 'C']).mean()
grouped = three_levels.T.groupby(axis=1, level=(1, 2))
for key, group in grouped:
pass
@@ -733,13 +730,13 @@ def test_multi_iter_panel(self):
expected = wp.reindex(major=exp_axis)
assert_panel_equal(group, expected)
- def test_dictify(self):
- dict(iter(self.df.groupby('A')))
- dict(iter(self.df.groupby(['A', 'B'])))
- dict(iter(self.df['C'].groupby(self.df['A'])))
- dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']])))
- dict(iter(self.df.groupby('A')['C']))
- dict(iter(self.df.groupby(['A', 'B'])['C']))
+ def test_dictify(self, df):
+ dict(iter(df.groupby('A')))
+ dict(iter(df.groupby(['A', 'B'])))
+ dict(iter(df['C'].groupby(df['A'])))
+ dict(iter(df['C'].groupby([df['A'], df['B']])))
+ dict(iter(df.groupby('A')['C']))
+ dict(iter(df.groupby(['A', 'B'])['C']))
def test_groupby_with_small_elem(self):
# GH 8542
diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
index ccde545b5b8e9..a32ba9ad76f14 100644
--- a/pandas/tests/groupby/test_nth.py
+++ b/pandas/tests/groupby/test_nth.py
@@ -7,314 +7,316 @@
assert_produces_warning,
assert_series_equal)
-from .common import MixIn
-
-
-class TestNth(MixIn):
-
- def test_first_last_nth(self):
- # tests for first / last / nth
- grouped = self.df.groupby('A')
- first = grouped.first()
- expected = self.df.loc[[1, 0], ['B', 'C', 'D']]
- expected.index = Index(['bar', 'foo'], name='A')
- expected = expected.sort_index()
- assert_frame_equal(first, expected)
-
- nth = grouped.nth(0)
- assert_frame_equal(nth, expected)
-
- last = grouped.last()
- expected = self.df.loc[[5, 7], ['B', 'C', 'D']]
- expected.index = Index(['bar', 'foo'], name='A')
- assert_frame_equal(last, expected)
-
- nth = grouped.nth(-1)
- assert_frame_equal(nth, expected)
-
- nth = grouped.nth(1)
- expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy()
- expected.index = Index(['foo', 'bar'], name='A')
- expected = expected.sort_index()
- assert_frame_equal(nth, expected)
-
- # it works!
- grouped['B'].first()
- grouped['B'].last()
- grouped['B'].nth(0)
-
- self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
- assert isna(grouped['B'].first()['foo'])
- assert isna(grouped['B'].last()['foo'])
- assert isna(grouped['B'].nth(0)['foo'])
-
- # v0.14.0 whatsnew
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- result = g.first()
- expected = df.iloc[[1, 2]].set_index('A')
- assert_frame_equal(result, expected)
-
- expected = df.iloc[[1, 2]].set_index('A')
- result = g.nth(0, dropna='any')
- assert_frame_equal(result, expected)
-
- def test_first_last_nth_dtypes(self):
-
- df = self.df_mixed_floats.copy()
- df['E'] = True
- df['F'] = 1
-
- # tests for first / last / nth
- grouped = df.groupby('A')
- first = grouped.first()
- expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
- expected.index = Index(['bar', 'foo'], name='A')
- expected = expected.sort_index()
- assert_frame_equal(first, expected)
-
- last = grouped.last()
- expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
- expected.index = Index(['bar', 'foo'], name='A')
- expected = expected.sort_index()
- assert_frame_equal(last, expected)
-
- nth = grouped.nth(1)
- expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
- expected.index = Index(['bar', 'foo'], name='A')
- expected = expected.sort_index()
- assert_frame_equal(nth, expected)
-
- # GH 2763, first/last shifting dtypes
- idx = lrange(10)
- idx.append(9)
- s = Series(data=lrange(11), index=idx, name='IntCol')
- assert s.dtype == 'int64'
- f = s.groupby(level=0).first()
- assert f.dtype == 'int64'
-
- def test_nth(self):
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
-
- assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
- assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
- assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
- assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
- assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
- assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
- assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
- assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
- assert_frame_equal(g[['B']].nth(0),
- df.loc[[0, 2], ['A', 'B']].set_index('A'))
-
- exp = df.set_index('A')
- assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
- assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
-
- exp['B'] = np.nan
- assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
- assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
-
- # out of bounds, regression from 0.13.1
- # GH 6621
- df = DataFrame({'color': {0: 'green',
- 1: 'green',
- 2: 'red',
- 3: 'red',
- 4: 'red'},
- 'food': {0: 'ham',
- 1: 'eggs',
- 2: 'eggs',
- 3: 'ham',
- 4: 'pork'},
- 'two': {0: 1.5456590000000001,
- 1: -0.070345000000000005,
- 2: -2.4004539999999999,
- 3: 0.46206000000000003,
- 4: 0.52350799999999997},
- 'one': {0: 0.56573799999999996,
- 1: -0.9742360000000001,
- 2: 1.033801,
- 3: -0.78543499999999999,
- 4: 0.70422799999999997}}).set_index(['color',
- 'food'])
-
- result = df.groupby(level=0, as_index=False).nth(2)
- expected = df.iloc[[-1]]
- assert_frame_equal(result, expected)
-
- result = df.groupby(level=0, as_index=False).nth(3)
- expected = df.loc[[]]
- assert_frame_equal(result, expected)
-
- # GH 7559
- # from the vbench
- df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
- s = df[1]
- g = df[0]
- expected = s.groupby(g).first()
- expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
- assert_series_equal(expected2, expected, check_names=False)
- assert expected.name == 1
- assert expected2.name == 1
-
- # validate first
- v = s[g == 1].iloc[0]
- assert expected.iloc[0] == v
- assert expected2.iloc[0] == v
-
- # this is NOT the same as .first (as sorted is default!)
- # as it keeps the order in the series (and not the group order)
- # related GH 7287
- expected = s.groupby(g, sort=False).first()
- result = s.groupby(g, sort=False).nth(0, dropna='all')
- assert_series_equal(result, expected)
-
- # doc example
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- # PR 17493, related to issue 11038
- # test Series.nth with True for dropna produces FutureWarning
- with assert_produces_warning(FutureWarning):
- result = g.B.nth(0, dropna=True)
- expected = g.B.first()
- assert_series_equal(result, expected)
-
- # test multiple nth values
- df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
- columns=['A', 'B'])
- g = df.groupby('A')
-
- assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
- assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
- assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
- assert_frame_equal(
- g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
- assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
- assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
-
- business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
- freq='B')
- df = DataFrame(1, index=business_dates, columns=['a', 'b'])
- # get the first, fourth and last two business days for each month
- key = [df.index.year, df.index.month]
- result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
- expected_dates = pd.to_datetime(
- ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
- '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
- '2014/6/27', '2014/6/30'])
- expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
- assert_frame_equal(result, expected)
-
- def test_nth_multi_index(self):
- # PR 9090, related to issue 8979
- # test nth on MultiIndex, should match .first()
- grouped = self.three_group.groupby(['A', 'B'])
- result = grouped.nth(0)
- expected = grouped.first()
- assert_frame_equal(result, expected)
-
- def test_nth_multi_index_as_expected(self):
- # PR 9090, related to issue 8979
- # test nth on MultiIndex
- three_group = DataFrame(
- {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
- 'foo', 'foo', 'foo'],
- 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
- 'two', 'two', 'one'],
- 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
- 'dull', 'shiny', 'shiny', 'shiny']})
- grouped = three_group.groupby(['A', 'B'])
- result = grouped.nth(0)
- expected = DataFrame(
- {'C': ['dull', 'dull', 'dull', 'dull']},
- index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
- ['one', 'two', 'one', 'two']],
- names=['A', 'B']))
- assert_frame_equal(result, expected)
-
- def test_groupby_head_tail(self):
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- g_as = df.groupby('A', as_index=True)
- g_not_as = df.groupby('A', as_index=False)
-
- # as_index= False, much easier
- assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
- assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
-
- empty_not_as = DataFrame(columns=df.columns,
- index=pd.Index([], dtype=df.index.dtype))
- empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
- empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
- assert_frame_equal(empty_not_as, g_not_as.head(0))
- assert_frame_equal(empty_not_as, g_not_as.tail(0))
- assert_frame_equal(empty_not_as, g_not_as.head(-1))
- assert_frame_equal(empty_not_as, g_not_as.tail(-1))
-
- assert_frame_equal(df, g_not_as.head(7)) # contains all
- assert_frame_equal(df, g_not_as.tail(7))
-
- # as_index=True, (used to be different)
- df_as = df
-
- assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
- assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
-
- empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
- empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
- empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
- assert_frame_equal(empty_as, g_as.head(0))
- assert_frame_equal(empty_as, g_as.tail(0))
- assert_frame_equal(empty_as, g_as.head(-1))
- assert_frame_equal(empty_as, g_as.tail(-1))
-
- assert_frame_equal(df_as, g_as.head(7)) # contains all
- assert_frame_equal(df_as, g_as.tail(7))
-
- # test with selection
- assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
- assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
- assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
- assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
-
- assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
- assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
- assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
- assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
-
- def test_group_selection_cache(self):
- # GH 12839 nth, head, and tail should return same result consistently
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- expected = df.iloc[[0, 2]].set_index('A')
-
- g = df.groupby('A')
- result1 = g.head(n=2)
- result2 = g.nth(0)
- assert_frame_equal(result1, df)
- assert_frame_equal(result2, expected)
-
- g = df.groupby('A')
- result1 = g.tail(n=2)
- result2 = g.nth(0)
- assert_frame_equal(result1, df)
- assert_frame_equal(result2, expected)
-
- g = df.groupby('A')
- result1 = g.nth(0)
- result2 = g.head(n=2)
- assert_frame_equal(result1, expected)
- assert_frame_equal(result2, df)
-
- g = df.groupby('A')
- result1 = g.nth(0)
- result2 = g.tail(n=2)
- assert_frame_equal(result1, expected)
- assert_frame_equal(result2, df)
+
+def test_first_last_nth(df):
+ # tests for first / last / nth
+ grouped = df.groupby('A')
+ first = grouped.first()
+ expected = df.loc[[1, 0], ['B', 'C', 'D']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(first, expected)
+
+ nth = grouped.nth(0)
+ assert_frame_equal(nth, expected)
+
+ last = grouped.last()
+ expected = df.loc[[5, 7], ['B', 'C', 'D']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ assert_frame_equal(last, expected)
+
+ nth = grouped.nth(-1)
+ assert_frame_equal(nth, expected)
+
+ nth = grouped.nth(1)
+ expected = df.loc[[2, 3], ['B', 'C', 'D']].copy()
+ expected.index = Index(['foo', 'bar'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(nth, expected)
+
+ # it works!
+ grouped['B'].first()
+ grouped['B'].last()
+ grouped['B'].nth(0)
+
+ df.loc[df['A'] == 'foo', 'B'] = np.nan
+ assert isna(grouped['B'].first()['foo'])
+ assert isna(grouped['B'].last()['foo'])
+ assert isna(grouped['B'].nth(0)['foo'])
+
+ # v0.14.0 whatsnew
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ result = g.first()
+ expected = df.iloc[[1, 2]].set_index('A')
+ assert_frame_equal(result, expected)
+
+ expected = df.iloc[[1, 2]].set_index('A')
+ result = g.nth(0, dropna='any')
+ assert_frame_equal(result, expected)
+
+
+def test_first_last_nth_dtypes(df_mixed_floats):
+
+ df = df_mixed_floats.copy()
+ df['E'] = True
+ df['F'] = 1
+
+ # tests for first / last / nth
+ grouped = df.groupby('A')
+ first = grouped.first()
+ expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(first, expected)
+
+ last = grouped.last()
+ expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(last, expected)
+
+ nth = grouped.nth(1)
+ expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
+ expected.index = Index(['bar', 'foo'], name='A')
+ expected = expected.sort_index()
+ assert_frame_equal(nth, expected)
+
+ # GH 2763, first/last shifting dtypes
+ idx = lrange(10)
+ idx.append(9)
+ s = Series(data=lrange(11), index=idx, name='IntCol')
+ assert s.dtype == 'int64'
+ f = s.groupby(level=0).first()
+ assert f.dtype == 'int64'
+
+
+def test_nth():
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+
+ assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
+ assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
+ assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
+ assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
+ assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
+ assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
+ assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
+ assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
+ assert_frame_equal(g[['B']].nth(0),
+ df.loc[[0, 2], ['A', 'B']].set_index('A'))
+
+ exp = df.set_index('A')
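+ # with dropna='any' the NaN row is skipped, so the first valid row
+ # of group 1 is the one at position 1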
+ assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
+ assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
+
+ exp['B'] = np.nan
+ assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
+ assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
+
+ # out of bounds, regression from 0.13.1
+ # GH 6621
+ df = DataFrame({'color': {0: 'green',
+ 1: 'green',
+ 2: 'red',
+ 3: 'red',
+ 4: 'red'},
+ 'food': {0: 'ham',
+ 1: 'eggs',
+ 2: 'eggs',
+ 3: 'ham',
+ 4: 'pork'},
+ 'two': {0: 1.5456590000000001,
+ 1: -0.070345000000000005,
+ 2: -2.4004539999999999,
+ 3: 0.46206000000000003,
+ 4: 0.52350799999999997},
+ 'one': {0: 0.56573799999999996,
+ 1: -0.9742360000000001,
+ 2: 1.033801,
+ 3: -0.78543499999999999,
+ 4: 0.70422799999999997}}).set_index(['color',
+ 'food'])
+
+ result = df.groupby(level=0, as_index=False).nth(2)
+ expected = df.iloc[[-1]]
+ assert_frame_equal(result, expected)
+
+ result = df.groupby(level=0, as_index=False).nth(3)
+ expected = df.loc[[]]
+ assert_frame_equal(result, expected)
+
+ # GH 7559
+ # from the vbench
+ df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
+ s = df[1]
+ g = df[0]
+ expected = s.groupby(g).first()
+ expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
+ assert_series_equal(expected2, expected, check_names=False)
+ assert expected.name == 1
+ assert expected2.name == 1
+
+ # validate first
+ v = s[g == 1].iloc[0]
+ assert expected.iloc[0] == v
+ assert expected2.iloc[0] == v
+
+ # this is NOT the same as .first() (as sort=True is the default!)
+ # as it keeps the order within the series (and not the group order)
+ # related GH 7287
+ expected = s.groupby(g, sort=False).first()
+ result = s.groupby(g, sort=False).nth(0, dropna='all')
+ assert_series_equal(result, expected)
+
+ # doc example
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+ g = df.groupby('A')
+ # PR 17493, related to issue 11038
+ # test Series.nth with True for dropna produces FutureWarning
+ with assert_produces_warning(FutureWarning):
+ result = g.B.nth(0, dropna=True)
+ expected = g.B.first()
+ assert_series_equal(result, expected)
+
+ # test multiple nth values
+ df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
+ columns=['A', 'B'])
+ g = df.groupby('A')
+
+ assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
+ assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
+ assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(
+ g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+ assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
+ assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
+
+ business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
+ freq='B')
+ df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+ # get the first, fourth and last two business days for each month
+ key = [df.index.year, df.index.month]
+ result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
+ expected_dates = pd.to_datetime(
+ ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
+ '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
+ '2014/6/27', '2014/6/30'])
+ expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
+ assert_frame_equal(result, expected)
+
+
+def test_nth_multi_index(three_group):
+ # PR 9090, related to issue 8979
+ # test nth on MultiIndex, should match .first()
+ grouped = three_group.groupby(['A', 'B'])
+ result = grouped.nth(0)
+ expected = grouped.first()
+ assert_frame_equal(result, expected)
+
+
+def test_nth_multi_index_as_expected():
+ # PR 9090, related to issue 8979
+ # test nth on MultiIndex
+ three_group = DataFrame(
+ {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+ 'foo', 'foo', 'foo'],
+ 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+ 'two', 'two', 'one'],
+ 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+ 'dull', 'shiny', 'shiny', 'shiny']})
+ grouped = three_group.groupby(['A', 'B'])
+ result = grouped.nth(0)
+ expected = DataFrame(
+ {'C': ['dull', 'dull', 'dull', 'dull']},
+ index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
+ ['one', 'two', 'one', 'two']],
+ names=['A', 'B']))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_head_tail():
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ g_as = df.groupby('A', as_index=True)
+ g_not_as = df.groupby('A', as_index=False)
+
+ # as_index=False, much easier
+ assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+ assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
+
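+ # head(0)/tail(0) and negative n return an empty frame; cast each
+ # column so the dtypes match the original (an empty DataFrame
+ # defaults to object dtype)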
+ empty_not_as = DataFrame(columns=df.columns,
+ index=pd.Index([], dtype=df.index.dtype))
+ empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
+ empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
+ assert_frame_equal(empty_not_as, g_not_as.head(0))
+ assert_frame_equal(empty_not_as, g_not_as.tail(0))
+ assert_frame_equal(empty_not_as, g_not_as.head(-1))
+ assert_frame_equal(empty_not_as, g_not_as.tail(-1))
+
+ assert_frame_equal(df, g_not_as.head(7)) # contains all
+ assert_frame_equal(df, g_not_as.tail(7))
+
+ # as_index=True (used to be different)
+ df_as = df
+
+ assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
+ assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
+
+ empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
+ empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
+ empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
+ assert_frame_equal(empty_as, g_as.head(0))
+ assert_frame_equal(empty_as, g_as.tail(0))
+ assert_frame_equal(empty_as, g_as.head(-1))
+ assert_frame_equal(empty_as, g_as.tail(-1))
+
+ assert_frame_equal(df_as, g_as.head(7)) # contains all
+ assert_frame_equal(df_as, g_as.tail(7))
+
+ # test with selection
+ assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
+ assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
+ assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
+ assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
+
+ assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
+ assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
+ assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
+ assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
+
+
+def test_group_selection_cache():
+ # GH 12839 nth, head, and tail should return same result consistently
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+ expected = df.iloc[[0, 2]].set_index('A')
+
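+ # run nth/head/tail in both orders to prove that the group selection
+ # cached by one call does not leak into the next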
+ g = df.groupby('A')
+ result1 = g.head(n=2)
+ result2 = g.nth(0)
+ assert_frame_equal(result1, df)
+ assert_frame_equal(result2, expected)
+
+ g = df.groupby('A')
+ result1 = g.tail(n=2)
+ result2 = g.nth(0)
+ assert_frame_equal(result1, df)
+ assert_frame_equal(result2, expected)
+
+ g = df.groupby('A')
+ result1 = g.nth(0)
+ result2 = g.head(n=2)
+ assert_frame_equal(result1, expected)
+ assert_frame_equal(result2, df)
+
+ g = df.groupby('A')
+ result1 = g.nth(0)
+ result2 = g.tail(n=2)
+ assert_frame_equal(result1, expected)
+ assert_frame_equal(result2, df)
def test_nth_empty():
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
new file mode 100644
index 0000000000000..6ad8b4905abff
--- /dev/null
+++ b/pandas/tests/groupby/test_rank.py
@@ -0,0 +1,254 @@
+import pytest
+import numpy as np
+import pandas as pd
+from pandas import DataFrame, concat
+from pandas.util import testing as tm
+
+
+def test_rank_apply():
+ lev1 = tm.rands_array(10, 100)
+ lev2 = tm.rands_array(10, 130)
+ lab1 = np.random.randint(0, 100, size=500)
+ lab2 = np.random.randint(0, 130, size=500)
+
+ df = DataFrame({'value': np.random.randn(500),
+ 'key1': lev1.take(lab1),
+ 'key2': lev2.take(lab2)})
+
+ result = df.groupby(['key1', 'key2']).value.rank()
+
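+ # build the expected ranks group by group, then restore the original
+ # row order before comparing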
+ expected = []
+ for key, piece in df.groupby(['key1', 'key2']):
+ expected.append(piece.value.rank())
+ expected = concat(expected, axis=0)
+ expected = expected.reindex(result.index)
+ tm.assert_series_equal(result, expected)
+
+ result = df.groupby(['key1', 'key2']).value.rank(pct=True)
+
+ expected = []
+ for key, piece in df.groupby(['key1', 'key2']):
+ expected.append(piece.value.rank(pct=True))
+ expected = concat(expected, axis=0)
+ expected = expected.reindex(result.index)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("grps", [
+ ['qux'], ['qux', 'quux']])
+@pytest.mark.parametrize("vals", [
+ [2, 2, 8, 2, 6],
+ [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
+ pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+ pd.Timestamp('2018-01-06')]])
+@pytest.mark.parametrize("ties_method,ascending,pct,exp", [
+ ('average', True, False, [2., 2., 5., 2., 4.]),
+ ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
+ ('average', False, False, [4., 4., 1., 4., 2.]),
+ ('average', False, True, [.8, .8, .2, .8, .4]),
+ ('min', True, False, [1., 1., 5., 1., 4.]),
+ ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
+ ('min', False, False, [3., 3., 1., 3., 2.]),
+ ('min', False, True, [.6, .6, .2, .6, .4]),
+ ('max', True, False, [3., 3., 5., 3., 4.]),
+ ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
+ ('max', False, False, [5., 5., 1., 5., 2.]),
+ ('max', False, True, [1., 1., .2, 1., .4]),
+ ('first', True, False, [1., 2., 5., 3., 4.]),
+ ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
+ ('first', False, False, [3., 4., 1., 5., 2.]),
+ ('first', False, True, [.6, .8, .2, 1., .4]),
+ ('dense', True, False, [1., 1., 3., 1., 2.]),
+ ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
+ ('dense', False, False, [3., 3., 1., 3., 2.]),
+ ('dense', False, True, [.6, .6, .2, .6, .4]),
+])
+def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
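+ # the same five values are tiled across every group, so each group
+ # produces an identical rank vector and `exp` simply repeats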
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending, pct=pct)
+
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+@pytest.mark.parametrize("grps", [
+ ['qux'], ['qux', 'quux']])
+@pytest.mark.parametrize("vals", [
+ [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf],
+])
+@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [
+ ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
+ ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]),
+ ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]),
+ ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
+ ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]),
+ ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]),
+ ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]),
+ ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]),
+ ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]),
+ ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]),
+ ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]),
+ ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]),
+ ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]),
+ ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]),
+ ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]),
+ ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]),
+ ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]),
+ ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]),
+ ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]),
+ ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]),
+ ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]),
+ ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]),
+ ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]),
+ ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]),
+ ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]),
+ ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]),
+ ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]),
+ ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]),
+ ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]),
+ ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.])
+])
+def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
+ # GH 20561
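+ # +/-inf rank as ordinary extreme values; na_option controls NaN:
+ # 'keep' leaves NaN in place, 'top' assigns NaNs the smallest ranks,
+ # 'bottom' the largest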
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option)
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+@pytest.mark.parametrize("grps", [
+ ['qux'], ['qux', 'quux']])
+@pytest.mark.parametrize("vals", [
+ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
+ [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
+ pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+ pd.Timestamp('2018-01-06'), np.nan, np.nan]
+])
+@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
+ ('average', True, 'keep', False,
+ [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
+ ('average', True, 'keep', True,
+ [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
+ ('average', False, 'keep', False,
+ [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
+ ('average', False, 'keep', True,
+ [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
+ ('min', True, 'keep', False,
+ [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
+ ('min', True, 'keep', True,
+ [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
+ ('min', False, 'keep', False,
+ [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+ ('min', False, 'keep', True,
+ [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+ ('max', True, 'keep', False,
+ [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
+ ('max', True, 'keep', True,
+ [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+ ('max', False, 'keep', False,
+ [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
+ ('max', False, 'keep', True,
+ [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+ ('first', True, 'keep', False,
+ [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
+ ('first', True, 'keep', True,
+ [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+ ('first', False, 'keep', False,
+ [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
+ ('first', False, 'keep', True,
+ [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+ ('dense', True, 'keep', False,
+ [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
+ ('dense', True, 'keep', True,
+ [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
+ ('dense', False, 'keep', False,
+ [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+ ('dense', False, 'keep', True,
+ [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+ ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
+ ('average', True, 'no_na', True,
+ [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
+ ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
+ ('average', False, 'no_na', True,
+ [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
+ ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
+ ('min', True, 'no_na', True,
+ [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
+ ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
+ ('min', False, 'no_na', True,
+ [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
+ ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
+ ('max', True, 'no_na', True,
+ [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
+ ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
+ ('max', False, 'no_na', True,
+ [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
+ ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
+ ('first', True, 'no_na', True,
+ [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
+ ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
+ ('first', False, 'no_na', True,
+ [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
+ ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
+ ('dense', True, 'no_na', True,
+ [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
+ ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
+ ('dense', False, 'no_na', True,
+ [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
+])
+def test_rank_args_missing(grps, vals, ties_method, ascending,
+ na_option, pct, exp):
+ key = np.repeat(grps, len(vals))
+ vals = vals * len(grps)
+ df = DataFrame({'key': key, 'val': vals})
+ result = df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option, pct=pct)
+
+ exp_df = DataFrame(exp * len(grps), columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+@pytest.mark.parametrize("pct,exp", [
+ (False, [3., 3., 3., 3., 3.]),
+ (True, [.6, .6, .6, .6, .6])])
+def test_rank_resets_each_group(pct, exp):
+ df = DataFrame(
+ {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
+ 'val': [1] * 10}
+ )
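+ # all five values within a group tie, so the average rank is
+ # (1 + 2 + 3 + 4 + 5) / 5 = 3 and pct gives 3 / 5 = 0.6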
+ result = df.groupby('key').rank(pct=pct)
+ exp_df = DataFrame(exp * 2, columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+def test_rank_avg_even_vals():
+ df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
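+ # four tied values average to (1 + 2 + 3 + 4) / 4 = 2.5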
+ result = df.groupby('key').rank()
+ exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
+ tm.assert_frame_equal(result, exp_df)
+
+
+@pytest.mark.parametrize("ties_method", [
+ 'average', 'min', 'max', 'first', 'dense'])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("vals", [
+ ['bar', 'bar', 'foo', 'bar', 'baz'],
+ ['bar', np.nan, 'foo', np.nan, 'baz']
+])
+def test_rank_object_raises(ties_method, ascending, na_option,
+ pct, vals):
+ df = DataFrame({'key': ['foo'] * 5, 'val': vals})
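+ # object dtype has no grouped rank implementation, so every keyword
+ # combination should raise TypeError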
+ with tm.assert_raises_regex(TypeError, "not callable"):
+ df.groupby('key').rank(method=ties_method,
+ ascending=ascending,
+ na_option=na_option, pct=pct)
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index 390b99d0fab1c..626057c1ea760 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -10,728 +10,758 @@
_ensure_platform_int, is_timedelta64_dtype)
from pandas.compat import StringIO
from pandas._libs import groupby
-from .common import MixIn, assert_fp_equal
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.core.groupby.groupby import DataError
from pandas.core.config import option_context
-class TestGroupBy(MixIn):
-
- def test_transform(self):
- data = Series(np.arange(9) // 3, index=np.arange(9))
-
- index = np.arange(9)
- np.random.shuffle(index)
- data = data.reindex(index)
-
- grouped = data.groupby(lambda x: x // 3)
-
- transformed = grouped.transform(lambda x: x * x.sum())
- assert transformed[7] == 12
-
- # GH 8046
- # make sure that we preserve the input order
-
- df = DataFrame(
- np.arange(6, dtype='int64').reshape(
- 3, 2), columns=["a", "b"], index=[0, 2, 1])
- key = [0, 0, 1]
- expected = df.sort_index().groupby(key).transform(
- lambda x: x - x.mean()).groupby(key).mean()
- result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
- key).mean()
- assert_frame_equal(result, expected)
-
- def demean(arr):
- return arr - arr.mean()
-
- people = DataFrame(np.random.randn(5, 5),
- columns=['a', 'b', 'c', 'd', 'e'],
- index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
- key = ['one', 'two', 'one', 'two', 'one']
- result = people.groupby(key).transform(demean).groupby(key).mean()
- expected = people.groupby(key).apply(demean).groupby(key).mean()
- assert_frame_equal(result, expected)
-
- # GH 8430
- df = tm.makeTimeDataFrame()
- g = df.groupby(pd.Grouper(freq='M'))
- g.transform(lambda x: x - 1)
-
- # GH 9700
- df = DataFrame({'a': range(5, 10), 'b': range(5)})
- result = df.groupby('a').transform(max)
- expected = DataFrame({'b': range(5)})
- tm.assert_frame_equal(result, expected)
-
- def test_transform_fast(self):
-
- df = DataFrame({'id': np.arange(100000) / 3,
- 'val': np.random.randn(100000)})
-
- grp = df.groupby('id')['val']
-
- values = np.repeat(grp.mean().values,
- _ensure_platform_int(grp.count().values))
- expected = pd.Series(values, index=df.index, name='val')
-
- result = grp.transform(np.mean)
- assert_series_equal(result, expected)
-
- result = grp.transform('mean')
- assert_series_equal(result, expected)
-
- # GH 12737
- df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
- 'd': pd.date_range('2014-1-1', '2014-1-4'),
- 'i': [1, 2, 3, 4]},
- columns=['grouping', 'f', 'i', 'd'])
- result = df.groupby('grouping').transform('first')
-
- dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
- pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
- expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
- 'd': dates,
- 'i': [1, 2, 2, 4]},
- columns=['f', 'i', 'd'])
- assert_frame_equal(result, expected)
-
- # selection
- result = df.groupby('grouping')[['f', 'i']].transform('first')
- expected = expected[['f', 'i']]
- assert_frame_equal(result, expected)
-
- # dup columns
- df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
- result = df.groupby('g').transform('first')
- expected = df.drop('g', axis=1)
- assert_frame_equal(result, expected)
-
- def test_transform_broadcast(self):
- grouped = self.ts.groupby(lambda x: x.month)
- result = grouped.transform(np.mean)
-
- tm.assert_index_equal(result.index, self.ts.index)
- for _, gp in grouped:
- assert_fp_equal(result.reindex(gp.index), gp.mean())
-
- grouped = self.tsframe.groupby(lambda x: x.month)
- result = grouped.transform(np.mean)
- tm.assert_index_equal(result.index, self.tsframe.index)
- for _, gp in grouped:
- agged = gp.mean()
- res = result.reindex(gp.index)
- for col in self.tsframe:
- assert_fp_equal(res[col], agged[col])
-
- # group columns
- grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
- axis=1)
- result = grouped.transform(np.mean)
- tm.assert_index_equal(result.index, self.tsframe.index)
- tm.assert_index_equal(result.columns, self.tsframe.columns)
- for _, gp in grouped:
- agged = gp.mean(1)
- res = result.reindex(columns=gp.columns)
- for idx in gp.index:
- assert_fp_equal(res.xs(idx), agged[idx])
-
- def test_transform_axis(self):
-
- # make sure that we are setting the axes
- # correctly when on axis=0 or 1
- # in the presence of a non-monotonic indexer
- # GH12713
-
- base = self.tsframe.iloc[0:5]
- r = len(base.index)
- c = len(base.columns)
- tso = DataFrame(np.random.randn(r, c),
- index=base.index,
- columns=base.columns,
- dtype='float64')
- # monotonic
- ts = tso
- grouped = ts.groupby(lambda x: x.weekday())
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: x - x.mean())
- assert_frame_equal(result, expected)
-
- ts = ts.T
- grouped = ts.groupby(lambda x: x.weekday(), axis=1)
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
- assert_frame_equal(result, expected)
-
- # non-monotonic
- ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
- grouped = ts.groupby(lambda x: x.weekday())
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: x - x.mean())
- assert_frame_equal(result, expected)
-
- ts = ts.T
- grouped = ts.groupby(lambda x: x.weekday(), axis=1)
- result = ts - grouped.transform('mean')
- expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
- assert_frame_equal(result, expected)
-
- def test_transform_dtype(self):
- # GH 9807
- # Check transform dtype output is preserved
- df = DataFrame([[1, 3], [2, 3]])
- result = df.groupby(1).transform('mean')
- expected = DataFrame([[1.5], [1.5]])
- assert_frame_equal(result, expected)
-
- def test_transform_bug(self):
- # GH 5712
- # transforming on a datetime column
- df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
- result = df.groupby('A')['B'].transform(
- lambda x: x.rank(ascending=False))
- expected = Series(np.arange(5, 0, step=-1), name='B')
- assert_series_equal(result, expected)
-
- def test_transform_numeric_to_boolean(self):
- # GH 16875
- # inconsistency in transforming boolean values
- expected = pd.Series([True, True], name='A')
-
- df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
- result = df.groupby('B').A.transform(lambda x: True)
- assert_series_equal(result, expected)
-
- df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
- result = df.groupby('B').A.transform(lambda x: True)
- assert_series_equal(result, expected)
-
- def test_transform_datetime_to_timedelta(self):
- # GH 15429
- # transforming a datetime to timedelta
- df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
- expected = pd.Series([
- Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')
-
- # this does date math without changing result type in transform
- base_time = df['A'][0]
- result = df.groupby('A')['A'].transform(
- lambda x: x.max() - x.min() + base_time) - base_time
- assert_series_equal(result, expected)
-
- # this does date math and causes the transform to return timedelta
- result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
- assert_series_equal(result, expected)
-
- def test_transform_datetime_to_numeric(self):
- # GH 10972
- # convert dt to float
- df = DataFrame({
- 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
- result = df.groupby('a').b.transform(
- lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
-
- expected = Series([-0.5, 0.5], name='b')
- assert_series_equal(result, expected)
-
- # convert dt to int
- df = DataFrame({
- 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
- result = df.groupby('a').b.transform(
- lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
-
- expected = Series([0, 1], name='b')
- assert_series_equal(result, expected)
-
- def test_transform_casting(self):
- # 13046
- data = """
- idx A ID3 DATETIME
- 0 B-028 b76cd912ff "2014-10-08 13:43:27"
- 1 B-054 4a57ed0b02 "2014-10-08 14:26:19"
- 2 B-076 1a682034f8 "2014-10-08 14:29:01"
- 3 B-023 b76cd912ff "2014-10-08 18:39:34"
- 4 B-023 f88g8d7sds "2014-10-08 18:40:18"
- 5 B-033 b76cd912ff "2014-10-08 18:44:30"
- 6 B-032 b76cd912ff "2014-10-08 18:46:00"
- 7 B-037 b76cd912ff "2014-10-08 18:52:15"
- 8 B-046 db959faf02 "2014-10-08 18:59:59"
- 9 B-053 b76cd912ff "2014-10-08 19:17:48"
- 10 B-065 b76cd912ff "2014-10-08 19:21:38"
- """
- df = pd.read_csv(StringIO(data), sep=r'\s+',
- index_col=[0], parse_dates=['DATETIME'])
-
- result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
- assert is_timedelta64_dtype(result.dtype)
-
- result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
- lambda x: x.diff())
- assert is_timedelta64_dtype(result.DATETIME.dtype)
-
- def test_transform_multiple(self):
- grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month])
-
- grouped.transform(lambda x: x * 2)
- grouped.transform(np.mean)
-
- def test_dispatch_transform(self):
- df = self.tsframe[::5].reindex(self.tsframe.index)
-
- grouped = df.groupby(lambda x: x.month)
-
- filled = grouped.fillna(method='pad')
- fillit = lambda x: x.fillna(method='pad')
- expected = df.groupby(lambda x: x.month).transform(fillit)
- assert_frame_equal(filled, expected)
-
- def test_transform_select_columns(self):
- f = lambda x: x.mean()
- result = self.df.groupby('A')['C', 'D'].transform(f)
-
- selection = self.df[['C', 'D']]
- expected = selection.groupby(self.df['A']).transform(f)
-
- assert_frame_equal(result, expected)
-
- def test_transform_exclude_nuisance(self):
-
- # this also tests orderings in transform between
- # series/frame to make sure it's consistent
- expected = {}
- grouped = self.df.groupby('A')
- expected['C'] = grouped['C'].transform(np.mean)
- expected['D'] = grouped['D'].transform(np.mean)
- expected = DataFrame(expected)
- result = self.df.groupby('A').transform(np.mean)
-
- assert_frame_equal(result, expected)
-
- def test_transform_function_aliases(self):
- result = self.df.groupby('A').transform('mean')
- expected = self.df.groupby('A').transform(np.mean)
- assert_frame_equal(result, expected)
-
- result = self.df.groupby('A')['C'].transform('mean')
- expected = self.df.groupby('A')['C'].transform(np.mean)
- assert_series_equal(result, expected)
-
- def test_series_fast_transform_date(self):
- # GH 13191
- df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
- 'd': pd.date_range('2014-1-1', '2014-1-4')})
- result = df.groupby('grouping')['d'].transform('first')
- dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
- pd.Timestamp('2014-1-4')]
- expected = pd.Series(dates, name='d')
- assert_series_equal(result, expected)
-
- def test_transform_length(self):
- # GH 9697
- df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
- expected = pd.Series([3.0] * 4)
-
- def nsum(x):
- return np.nansum(x)
-
- results = [df.groupby('col1').transform(sum)['col2'],
- df.groupby('col1')['col2'].transform(sum),
- df.groupby('col1').transform(nsum)['col2'],
- df.groupby('col1')['col2'].transform(nsum)]
- for result in results:
- assert_series_equal(result, expected, check_names=False)
-
- def test_transform_coercion(self):
-
- # 14457
- # when we are transforming be sure to not coerce
- # via assignment
- df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
- g = df.groupby('A')
-
- expected = g.transform(np.mean)
- result = g.transform(lambda x: np.mean(x))
- assert_frame_equal(result, expected)
-
- def test_groupby_transform_with_int(self):
-
- # GH 3740, make sure that we might upcast on item-by-item transform
-
- # floats
- df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
- C=Series(
- [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
- with np.errstate(all='ignore'):
- result = df.groupby('A').transform(
- lambda x: (x - x.mean()) / x.std())
- expected = DataFrame(dict(B=np.nan, C=Series(
- [-1, 0, 1, -1, 0, 1], dtype='float64')))
- assert_frame_equal(result, expected)
-
- # int case
- df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
- C=[1, 2, 3, 1, 2, 3], D='foo'))
- with np.errstate(all='ignore'):
- result = df.groupby('A').transform(
- lambda x: (x - x.mean()) / x.std())
- expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
- assert_frame_equal(result, expected)
-
- # int that needs float conversion
- s = Series([2, 3, 4, 10, 5, -1])
- df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
- with np.errstate(all='ignore'):
- result = df.groupby('A').transform(
- lambda x: (x - x.mean()) / x.std())
-
- s1 = s.iloc[0:3]
- s1 = (s1 - s1.mean()) / s1.std()
- s2 = s.iloc[3:6]
- s2 = (s2 - s2.mean()) / s2.std()
- expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
- assert_frame_equal(result, expected)
-
- # int downcasting
- result = df.groupby('A').transform(lambda x: x * 2 / 2)
- expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
- assert_frame_equal(result, expected)
-
- def test_groupby_transform_with_nan_group(self):
- # GH 9941
- df = pd.DataFrame({'a': range(10),
- 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
- result = df.groupby(df.b)['a'].transform(max)
- expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
- name='a')
- assert_series_equal(result, expected)
-
- def test_transform_mixed_type(self):
- index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
- ])
- df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
- 'c': np.tile(['a', 'b', 'c'], 2),
- 'v': np.arange(1., 7.)}, index=index)
-
- def f(group):
- group['g'] = group['d'] * 2
- return group[:1]
-
- grouped = df.groupby('c')
- result = grouped.apply(f)
-
- assert result['d'].dtype == np.float64
-
- # this is by definition a mutating operation!
- with option_context('mode.chained_assignment', None):
- for key, group in grouped:
- res = f(group)
- assert_frame_equal(res, result.loc[key])
-
- def test_cython_group_transform_algos(self):
- # GH 4095
- dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
- np.uint64, np.float32, np.float64]
-
- ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
- (groupby.group_cumsum, np.cumsum, dtypes)]
-
- is_datetimelike = False
- for pd_op, np_op, dtypes in ops:
- for dtype in dtypes:
- data = np.array([[1], [2], [3], [4]], dtype=dtype)
- ans = np.zeros_like(data)
- labels = np.array([0, 0, 0, 0], dtype=np.int64)
- pd_op(ans, data, labels, is_datetimelike)
- tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
- check_dtype=False)
-
- # with nans
- labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
-
- data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
- actual = np.zeros_like(data)
- actual.fill(np.nan)
- groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
- expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
- tm.assert_numpy_array_equal(actual[:, 0], expected)
-
- actual = np.zeros_like(data)
- actual.fill(np.nan)
- groupby.group_cumsum(actual, data, labels, is_datetimelike)
- expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
- tm.assert_numpy_array_equal(actual[:, 0], expected)
-
- # timedelta
- is_datetimelike = True
- data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
- actual = np.zeros_like(data, dtype='int64')
- groupby.group_cumsum(actual, data.view('int64'), labels,
- is_datetimelike)
- expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
- 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
- np.timedelta64(5, 'ns')])
- tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
-
- @pytest.mark.parametrize(
- "op, args, targop",
- [('cumprod', (), lambda x: x.cumprod()),
- ('cumsum', (), lambda x: x.cumsum()),
- ('shift', (-1, ), lambda x: x.shift(-1)),
- ('shift', (1, ), lambda x: x.shift())])
- def test_cython_transform_series(self, op, args, targop):
- # GH 4095
- s = Series(np.random.randn(1000))
- s_missing = s.copy()
- s_missing.iloc[2:10] = np.nan
- labels = np.random.randint(0, 50, size=1000).astype(float)
-
- # series
- for data in [s, s_missing]:
- # print(data.head())
- expected = data.groupby(labels).transform(targop)
-
- tm.assert_series_equal(
+def assert_fp_equal(a, b):
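+    # element-wise tolerance check; replaces the helper that used to be
+    # imported from the groupby tests' common module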
+ assert (np.abs(a - b) < 1e-12).all()
+
+
+def test_transform():
+ data = Series(np.arange(9) // 3, index=np.arange(9))
+
+ index = np.arange(9)
+ np.random.shuffle(index)
+ data = data.reindex(index)
+
+ grouped = data.groupby(lambda x: x // 3)
+
+ transformed = grouped.transform(lambda x: x * x.sum())
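+    # index 7 sits in the last group (values 2, 2, 2), so 2 * 6 == 12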
+ assert transformed[7] == 12
+
+ # GH 8046
+ # make sure that we preserve the input order
+
+ df = DataFrame(
+ np.arange(6, dtype='int64').reshape(
+ 3, 2), columns=["a", "b"], index=[0, 2, 1])
+ key = [0, 0, 1]
+ expected = df.sort_index().groupby(key).transform(
+ lambda x: x - x.mean()).groupby(key).mean()
+ result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
+ key).mean()
+ assert_frame_equal(result, expected)
+
+ def demean(arr):
+ return arr - arr.mean()
+
+ people = DataFrame(np.random.randn(5, 5),
+ columns=['a', 'b', 'c', 'd', 'e'],
+ index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
+ key = ['one', 'two', 'one', 'two', 'one']
+ result = people.groupby(key).transform(demean).groupby(key).mean()
+ expected = people.groupby(key).apply(demean).groupby(key).mean()
+ assert_frame_equal(result, expected)
+
+ # GH 8430
+ df = tm.makeTimeDataFrame()
+ g = df.groupby(pd.Grouper(freq='M'))
+ g.transform(lambda x: x - 1)
+
+ # GH 9700
+ df = DataFrame({'a': range(5, 10), 'b': range(5)})
+ result = df.groupby('a').transform(max)
+ expected = DataFrame({'b': range(5)})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_fast():
+
+ df = DataFrame({'id': np.arange(100000) / 3,
+ 'val': np.random.randn(100000)})
+
+ grp = df.groupby('id')['val']
+
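+    # broadcast each group's mean across its rows to build the expected
+    # series by hand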
+ values = np.repeat(grp.mean().values,
+ _ensure_platform_int(grp.count().values))
+ expected = pd.Series(values, index=df.index, name='val')
+
+ result = grp.transform(np.mean)
+ assert_series_equal(result, expected)
+
+ result = grp.transform('mean')
+ assert_series_equal(result, expected)
+
+ # GH 12737
+ df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
+ 'd': pd.date_range('2014-1-1', '2014-1-4'),
+ 'i': [1, 2, 3, 4]},
+ columns=['grouping', 'f', 'i', 'd'])
+ result = df.groupby('grouping').transform('first')
+
+ dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
+ pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
+ expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
+ 'd': dates,
+ 'i': [1, 2, 2, 4]},
+ columns=['f', 'i', 'd'])
+ assert_frame_equal(result, expected)
+
+ # selection
+ result = df.groupby('grouping')[['f', 'i']].transform('first')
+ expected = expected[['f', 'i']]
+ assert_frame_equal(result, expected)
+
+ # dup columns
+ df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
+ result = df.groupby('g').transform('first')
+ expected = df.drop('g', axis=1)
+ assert_frame_equal(result, expected)
+
+
+def test_transform_broadcast(tsframe, ts):
+ grouped = ts.groupby(lambda x: x.month)
+ result = grouped.transform(np.mean)
+
+ tm.assert_index_equal(result.index, ts.index)
+ for _, gp in grouped:
+ assert_fp_equal(result.reindex(gp.index), gp.mean())
+
+ grouped = tsframe.groupby(lambda x: x.month)
+ result = grouped.transform(np.mean)
+ tm.assert_index_equal(result.index, tsframe.index)
+ for _, gp in grouped:
+ agged = gp.mean()
+ res = result.reindex(gp.index)
+ for col in tsframe:
+ assert_fp_equal(res[col], agged[col])
+
+ # group columns
+ grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
+ axis=1)
+ result = grouped.transform(np.mean)
+ tm.assert_index_equal(result.index, tsframe.index)
+ tm.assert_index_equal(result.columns, tsframe.columns)
+ for _, gp in grouped:
+ agged = gp.mean(1)
+ res = result.reindex(columns=gp.columns)
+ for idx in gp.index:
+ assert_fp_equal(res.xs(idx), agged[idx])
+
+
+def test_transform_axis(tsframe):
+
+ # make sure that we are setting the axes
+ # correctly when on axis=0 or 1
+ # in the presence of a non-monotonic indexer
+ # GH12713
+
+ base = tsframe.iloc[0:5]
+ r = len(base.index)
+ c = len(base.columns)
+ tso = DataFrame(np.random.randn(r, c),
+ index=base.index,
+ columns=base.columns,
+ dtype='float64')
+ # monotonic
+ ts = tso
+ grouped = ts.groupby(lambda x: x.weekday())
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: x - x.mean())
+ assert_frame_equal(result, expected)
+
+ ts = ts.T
+ grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+ assert_frame_equal(result, expected)
+
+ # non-monotonic
+ ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
+ grouped = ts.groupby(lambda x: x.weekday())
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: x - x.mean())
+ assert_frame_equal(result, expected)
+
+ ts = ts.T
+ grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+ result = ts - grouped.transform('mean')
+ expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+ assert_frame_equal(result, expected)
+
+
+def test_transform_dtype():
+ # GH 9807
+ # Check transform dtype output is preserved
+ df = DataFrame([[1, 3], [2, 3]])
+ result = df.groupby(1).transform('mean')
+ expected = DataFrame([[1.5], [1.5]])
+ assert_frame_equal(result, expected)
+
+
+def test_transform_bug():
+ # GH 5712
+ # transforming on a datetime column
+ df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+ result = df.groupby('A')['B'].transform(
+ lambda x: x.rank(ascending=False))
+ expected = Series(np.arange(5, 0, step=-1), name='B')
+ assert_series_equal(result, expected)
+
+
+def test_transform_numeric_to_boolean():
+ # GH 16875
+ # inconsistency in transforming boolean values
+ expected = pd.Series([True, True], name='A')
+
+ df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
+ result = df.groupby('B').A.transform(lambda x: True)
+ assert_series_equal(result, expected)
+
+ df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
+ result = df.groupby('B').A.transform(lambda x: True)
+ assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_timedelta():
+ # GH 15429
+ # transforming a datetime to timedelta
+ df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+ expected = pd.Series([
+ Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')
+
+ # this does date math without changing result type in transform
+ base_time = df['A'][0]
+ result = df.groupby('A')['A'].transform(
+ lambda x: x.max() - x.min() + base_time) - base_time
+ assert_series_equal(result, expected)
+
+ # this does date math and causes the transform to return timedelta
+ result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
+ assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_numeric():
+ # GH 10972
+ # convert dt to float
+ df = DataFrame({
+ 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+ result = df.groupby('a').b.transform(
+ lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
+
+ expected = Series([-0.5, 0.5], name='b')
+ assert_series_equal(result, expected)
+
+ # convert dt to int
+ df = DataFrame({
+ 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+ result = df.groupby('a').b.transform(
+ lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
+
+ expected = Series([0, 1], name='b')
+ assert_series_equal(result, expected)
+
+
+def test_transform_casting():
+    # GH 13046
+ data = """
+ idx A ID3 DATETIME
+ 0 B-028 b76cd912ff "2014-10-08 13:43:27"
+ 1 B-054 4a57ed0b02 "2014-10-08 14:26:19"
+ 2 B-076 1a682034f8 "2014-10-08 14:29:01"
+ 3 B-023 b76cd912ff "2014-10-08 18:39:34"
+ 4 B-023 f88g8d7sds "2014-10-08 18:40:18"
+ 5 B-033 b76cd912ff "2014-10-08 18:44:30"
+ 6 B-032 b76cd912ff "2014-10-08 18:46:00"
+ 7 B-037 b76cd912ff "2014-10-08 18:52:15"
+ 8 B-046 db959faf02 "2014-10-08 18:59:59"
+ 9 B-053 b76cd912ff "2014-10-08 19:17:48"
+ 10 B-065 b76cd912ff "2014-10-08 19:21:38"
+ """
+ df = pd.read_csv(StringIO(data), sep=r'\s+',
+ index_col=[0], parse_dates=['DATETIME'])
+
+ result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
+ assert is_timedelta64_dtype(result.dtype)
+
+ result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
+ lambda x: x.diff())
+ assert is_timedelta64_dtype(result.DATETIME.dtype)
+
+
+def test_transform_multiple(ts):
+ grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+
+ grouped.transform(lambda x: x * 2)
+ grouped.transform(np.mean)
+
+
+def test_dispatch_transform(tsframe):
+ df = tsframe[::5].reindex(tsframe.index)
+
+ grouped = df.groupby(lambda x: x.month)
+
+ filled = grouped.fillna(method='pad')
+ fillit = lambda x: x.fillna(method='pad')
+ expected = df.groupby(lambda x: x.month).transform(fillit)
+ assert_frame_equal(filled, expected)
+
+
+def test_transform_select_columns(df):
+ f = lambda x: x.mean()
+ result = df.groupby('A')['C', 'D'].transform(f)
+
+ selection = df[['C', 'D']]
+ expected = selection.groupby(df['A']).transform(f)
+
+ assert_frame_equal(result, expected)
+
+
+def test_transform_exclude_nuisance(df):
+
+ # this also tests orderings in transform between
+ # series/frame to make sure it's consistent
+ expected = {}
+ grouped = df.groupby('A')
+ expected['C'] = grouped['C'].transform(np.mean)
+ expected['D'] = grouped['D'].transform(np.mean)
+ expected = DataFrame(expected)
+ result = df.groupby('A').transform(np.mean)
+
+ assert_frame_equal(result, expected)
+
+
+def test_transform_function_aliases(df):
+ result = df.groupby('A').transform('mean')
+ expected = df.groupby('A').transform(np.mean)
+ assert_frame_equal(result, expected)
+
+ result = df.groupby('A')['C'].transform('mean')
+ expected = df.groupby('A')['C'].transform(np.mean)
+ assert_series_equal(result, expected)
+
+
+def test_series_fast_transform_date():
+ # GH 13191
+ df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
+ 'd': pd.date_range('2014-1-1', '2014-1-4')})
+ result = df.groupby('grouping')['d'].transform('first')
+ dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
+ pd.Timestamp('2014-1-4')]
+ expected = pd.Series(dates, name='d')
+ assert_series_equal(result, expected)
+
+
+def test_transform_length():
+ # GH 9697
+ df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
+ expected = pd.Series([3.0] * 4)
+
+ def nsum(x):
+ return np.nansum(x)
+
+ results = [df.groupby('col1').transform(sum)['col2'],
+ df.groupby('col1')['col2'].transform(sum),
+ df.groupby('col1').transform(nsum)['col2'],
+ df.groupby('col1')['col2'].transform(nsum)]
+ for result in results:
+ assert_series_equal(result, expected, check_names=False)
+
+
+def test_transform_coercion():
+
+    # GH 14457
+    # when transforming, be sure not to coerce via assignment
+ df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
+ g = df.groupby('A')
+
+ expected = g.transform(np.mean)
+ result = g.transform(lambda x: np.mean(x))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_transform_with_int():
+
+    # GH 3740, make sure that we upcast on item-by-item transform when needed
+
+ # floats
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
+ C=Series(
+ [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+ expected = DataFrame(dict(B=np.nan, C=Series(
+ [-1, 0, 1, -1, 0, 1], dtype='float64')))
+ assert_frame_equal(result, expected)
+
+ # int case
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
+ C=[1, 2, 3, 1, 2, 3], D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+ expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
+ assert_frame_equal(result, expected)
+
+ # int that needs float conversion
+ s = Series([2, 3, 4, 10, 5, -1])
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
+ with np.errstate(all='ignore'):
+ result = df.groupby('A').transform(
+ lambda x: (x - x.mean()) / x.std())
+
+ s1 = s.iloc[0:3]
+ s1 = (s1 - s1.mean()) / s1.std()
+ s2 = s.iloc[3:6]
+ s2 = (s2 - s2.mean()) / s2.std()
+ expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
+ assert_frame_equal(result, expected)
+
+ # int downcasting
+ result = df.groupby('A').transform(lambda x: x * 2 / 2)
+ expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
+ assert_frame_equal(result, expected)
+
+
+def test_groupby_transform_with_nan_group():
+ # GH 9941
+ df = pd.DataFrame({'a': range(10),
+ 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
+ result = df.groupby(df.b)['a'].transform(max)
+ expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
+ name='a')
+ assert_series_equal(result, expected)
+
+
+def test_transform_mixed_type():
+ index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
+ ])
+ df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
+ 'c': np.tile(['a', 'b', 'c'], 2),
+ 'v': np.arange(1., 7.)}, index=index)
+
+ def f(group):
+ group['g'] = group['d'] * 2
+ return group[:1]
+
+ grouped = df.groupby('c')
+ result = grouped.apply(f)
+
+ assert result['d'].dtype == np.float64
+
+ # this is by definition a mutating operation!
+ with option_context('mode.chained_assignment', None):
+ for key, group in grouped:
+ res = f(group)
+ assert_frame_equal(res, result.loc[key])
+
+
+def test_cython_group_transform_algos():
+ # GH 4095
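+    # a single group (all labels zero), so the cython cumulative ops
+    # must match numpy's over the whole column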
+ dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
+ np.uint64, np.float32, np.float64]
+
+ ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+ (groupby.group_cumsum, np.cumsum, dtypes)]
+
+ is_datetimelike = False
+ for pd_op, np_op, dtypes in ops:
+ for dtype in dtypes:
+ data = np.array([[1], [2], [3], [4]], dtype=dtype)
+ ans = np.zeros_like(data)
+ labels = np.array([0, 0, 0, 0], dtype=np.int64)
+ pd_op(ans, data, labels, is_datetimelike)
+ tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
+ check_dtype=False)
+
+ # with nans
+ labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+
+ data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
+ actual = np.zeros_like(data)
+ actual.fill(np.nan)
+ groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+ expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
+ tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+ actual = np.zeros_like(data)
+ actual.fill(np.nan)
+ groupby.group_cumsum(actual, data, labels, is_datetimelike)
+ expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
+ tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+ # timedelta
+ is_datetimelike = True
+ data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
+ actual = np.zeros_like(data, dtype='int64')
+ groupby.group_cumsum(actual, data.view('int64'), labels,
+ is_datetimelike)
+ expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
+ 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
+ np.timedelta64(5, 'ns')])
+ tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
+
+
+@pytest.mark.parametrize(
+ "op, args, targop",
+ [('cumprod', (), lambda x: x.cumprod()),
+ ('cumsum', (), lambda x: x.cumsum()),
+ ('shift', (-1, ), lambda x: x.shift(-1)),
+ ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_series(op, args, targop):
+ # GH 4095
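+    # the named transform, the bound groupby method and the plain
+    # python targop must all give the same answer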
+ s = Series(np.random.randn(1000))
+ s_missing = s.copy()
+ s_missing.iloc[2:10] = np.nan
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+
+ # series
+ for data in [s, s_missing]:
+ # print(data.head())
+ expected = data.groupby(labels).transform(targop)
+
+ tm.assert_series_equal(
+ expected,
+ data.groupby(labels).transform(op, *args))
+ tm.assert_series_equal(expected, getattr(
+ data.groupby(labels), op)(*args))
+
+
+@pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
+@pytest.mark.parametrize("skipna", [False, True])
+@pytest.mark.parametrize('input, exp', [
+ # When everything is NaN
+ ({'key': ['b'] * 10, 'value': np.nan},
+ pd.Series([np.nan] * 10, name='value')),
+ # When there is a single NaN
+ ({'key': ['b'] * 10 + ['a'] * 2,
+ 'value': [3] * 3 + [np.nan] + [3] * 8},
+ {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+ ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
+ 2187., 6561., 19683., 3.0, 9.0],
+ ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+ ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
+ 21., 24., 27., 3.0, 6.0]})])
+def test_groupby_cum_skipna(op, skipna, input, exp):
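+    # exp is a ready Series when every (op, skipna) combination agrees,
+    # otherwise a dict keyed by (op, skipna)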
+ df = pd.DataFrame(input)
+ result = df.groupby('key')['value'].transform(op, skipna=skipna)
+ if isinstance(exp, dict):
+ expected = exp[(op, skipna)]
+ else:
+ expected = exp
+ expected = pd.Series(expected, name='value')
+ tm.assert_series_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+ "op, args, targop",
+ [('cumprod', (), lambda x: x.cumprod()),
+ ('cumsum', (), lambda x: x.cumsum()),
+ ('shift', (-1, ), lambda x: x.shift(-1)),
+ ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_frame(op, args, targop):
+ s = Series(np.random.randn(1000))
+ s_missing = s.copy()
+ s_missing.iloc[2:10] = np.nan
+ labels = np.random.randint(0, 50, size=1000).astype(float)
+ strings = list('qwertyuiopasdfghjklz')
+ strings_missing = strings[:]
+ strings_missing[5] = np.nan
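+    # mix numeric, datetime-like, string and categorical columns so the
+    # cython transforms are exercised across dtypes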
+ df = DataFrame({'float': s,
+ 'float_missing': s_missing,
+ 'int': [1, 1, 1, 1, 2] * 200,
+ 'datetime': pd.date_range('1990-1-1', periods=1000),
+ 'timedelta': pd.timedelta_range(1, freq='s',
+ periods=1000),
+ 'string': strings * 50,
+ 'string_missing': strings_missing * 50},
+ columns=['float', 'float_missing', 'int', 'datetime',
+ 'timedelta', 'string', 'string_missing'])
+ df['cat'] = df['string'].astype('category')
+
+ df2 = df.copy()
+ df2.index = pd.MultiIndex.from_product([range(100), range(10)])
+
+ # DataFrame - Single and MultiIndex,
+ # group by values, index level, columns
+ for df in [df, df2]:
+ for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
+ ]: # dict(by='string_missing')]:
+ # dict(by=['int','string'])]:
+
+ gb = df.groupby(**gb_target)
+ # whitelisted methods set the selection before applying
+            # a bit of a hack to make sure the cythonized shift
+ # is equivalent to pre 0.17.1 behavior
+ if op == 'shift':
+ gb._set_group_selection()
+
+ if op != 'shift' and 'int' not in gb_target:
+ # numeric apply fastpath promotes dtype so have
+ # to apply separately and concat
+ i = gb[['int']].apply(targop)
+ f = gb[['float', 'float_missing']].apply(targop)
+ expected = pd.concat([f, i], axis=1)
+ else:
+ expected = gb.apply(targop)
+
+ expected = expected.sort_index(axis=1)
+ tm.assert_frame_equal(expected,
+ gb.transform(op, *args).sort_index(
+ axis=1))
+ tm.assert_frame_equal(
expected,
- data.groupby(labels).transform(op, *args))
- tm.assert_series_equal(expected, getattr(
- data.groupby(labels), op)(*args))
-
- @pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
- @pytest.mark.parametrize("skipna", [False, True])
- @pytest.mark.parametrize('input, exp', [
- # When everything is NaN
- ({'key': ['b'] * 10, 'value': np.nan},
- pd.Series([np.nan] * 10, name='value')),
- # When there is a single NaN
- ({'key': ['b'] * 10 + ['a'] * 2,
- 'value': [3] * 3 + [np.nan] + [3] * 8},
- {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
- ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
- 2187., 6561., 19683., 3.0, 9.0],
- ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
- ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
- 21., 24., 27., 3.0, 6.0]})])
- def test_groupby_cum_skipna(self, op, skipna, input, exp):
- df = pd.DataFrame(input)
- result = df.groupby('key')['value'].transform(op, skipna=skipna)
- if isinstance(exp, dict):
- expected = exp[(op, skipna)]
- else:
- expected = exp
- expected = pd.Series(expected, name='value')
- tm.assert_series_equal(expected, result)
-
- @pytest.mark.parametrize(
- "op, args, targop",
- [('cumprod', (), lambda x: x.cumprod()),
- ('cumsum', (), lambda x: x.cumsum()),
- ('shift', (-1, ), lambda x: x.shift(-1)),
- ('shift', (1, ), lambda x: x.shift())])
- def test_cython_transform_frame(self, op, args, targop):
- s = Series(np.random.randn(1000))
- s_missing = s.copy()
- s_missing.iloc[2:10] = np.nan
- labels = np.random.randint(0, 50, size=1000).astype(float)
- strings = list('qwertyuiopasdfghjklz')
- strings_missing = strings[:]
- strings_missing[5] = np.nan
- df = DataFrame({'float': s,
- 'float_missing': s_missing,
- 'int': [1, 1, 1, 1, 2] * 200,
- 'datetime': pd.date_range('1990-1-1', periods=1000),
- 'timedelta': pd.timedelta_range(1, freq='s',
- periods=1000),
- 'string': strings * 50,
- 'string_missing': strings_missing * 50},
- columns=['float', 'float_missing', 'int', 'datetime',
- 'timedelta', 'string', 'string_missing'])
- df['cat'] = df['string'].astype('category')
-
- df2 = df.copy()
- df2.index = pd.MultiIndex.from_product([range(100), range(10)])
-
- # DataFrame - Single and MultiIndex,
- # group by values, index level, columns
- for df in [df, df2]:
- for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
- ]: # dict(by='string_missing')]:
- # dict(by=['int','string'])]:
-
- gb = df.groupby(**gb_target)
- # whitelisted methods set the selection before applying
- # bit a of hack to make sure the cythonized shift
- # is equivalent to pre 0.17.1 behavior
- if op == 'shift':
- gb._set_group_selection()
-
- if op != 'shift' and 'int' not in gb_target:
- # numeric apply fastpath promotes dtype so have
- # to apply separately and concat
- i = gb[['int']].apply(targop)
- f = gb[['float', 'float_missing']].apply(targop)
- expected = pd.concat([f, i], axis=1)
+ getattr(gb, op)(*args).sort_index(axis=1))
+ # individual columns
+ for c in df:
+ if c not in ['float', 'int', 'float_missing'
+ ] and op != 'shift':
+ pytest.raises(DataError, gb[c].transform, op)
+ pytest.raises(DataError, getattr(gb[c], op))
else:
- expected = gb.apply(targop)
-
- expected = expected.sort_index(axis=1)
- tm.assert_frame_equal(expected,
- gb.transform(op, *args).sort_index(
- axis=1))
- tm.assert_frame_equal(
- expected,
- getattr(gb, op)(*args).sort_index(axis=1))
- # individual columns
- for c in df:
- if c not in ['float', 'int', 'float_missing'
- ] and op != 'shift':
- pytest.raises(DataError, gb[c].transform, op)
- pytest.raises(DataError, getattr(gb[c], op))
- else:
- expected = gb[c].apply(targop)
- expected.name = c
- tm.assert_series_equal(expected,
- gb[c].transform(op, *args))
- tm.assert_series_equal(expected,
- getattr(gb[c], op)(*args))
-
- def test_transform_with_non_scalar_group(self):
- # GH 10165
- cols = pd.MultiIndex.from_tuples([
- ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
- ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
- ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
- ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
- df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
- columns=cols,
- index=['A', 'C', 'G', 'T'])
- tm.assert_raises_regex(ValueError, 'transform must return '
- 'a scalar value for each '
- 'group.*',
- df.groupby(axis=1, level=1).transform,
- lambda z: z.div(z.sum(axis=1), axis=0))
-
- @pytest.mark.parametrize('cols,exp,comp_func', [
- ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
- (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
- tm.assert_frame_equal)
- ])
- @pytest.mark.parametrize('agg_func', [
- 'count', 'rank', 'size'])
- def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func):
- if agg_func == 'size' and isinstance(cols, list):
- pytest.xfail("'size' transformation not supported with "
- "NDFrameGroupy")
-
- # GH 19200
- df = pd.DataFrame(
- {'a': pd.date_range('2018-01-01', periods=3),
- 'b': range(3),
- 'c': range(7, 10)})
-
- result = df.groupby('b')[cols].transform(agg_func)
-
- if agg_func == 'rank':
- exp = exp.astype('float')
-
- comp_func(result, exp)
-
- @pytest.mark.parametrize("mix_groupings", [True, False])
- @pytest.mark.parametrize("as_series", [True, False])
- @pytest.mark.parametrize("val1,val2", [
- ('foo', 'bar'), (1, 2), (1., 2.)])
- @pytest.mark.parametrize("fill_method,limit,exp_vals", [
- ("ffill", None,
- [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
- ("ffill", 1,
- [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
- ("bfill", None,
- ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
- ("bfill", 1,
- [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
- ])
- def test_group_fill_methods(self, mix_groupings, as_series, val1, val2,
- fill_method, limit, exp_vals):
- vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
- _exp_vals = list(exp_vals)
- # Overwrite placeholder values
- for index, exp_val in enumerate(_exp_vals):
- if exp_val == 'val1':
- _exp_vals[index] = val1
- elif exp_val == 'val2':
- _exp_vals[index] = val2
-
- # Need to modify values and expectations depending on the
- # Series / DataFrame that we ultimately want to generate
- if mix_groupings: # ['a', 'b', 'a, 'b', ...]
- keys = ['a', 'b'] * len(vals)
-
- def interweave(list_obj):
- temp = list()
- for x in list_obj:
- temp.extend([x, x])
-
- return temp
-
- _exp_vals = interweave(_exp_vals)
- vals = interweave(vals)
- else: # ['a', 'a', 'a', ... 'b', 'b', 'b']
- keys = ['a'] * len(vals) + ['b'] * len(vals)
- _exp_vals = _exp_vals * 2
- vals = vals * 2
-
- df = DataFrame({'key': keys, 'val': vals})
- if as_series:
- result = getattr(
- df.groupby('key')['val'], fill_method)(limit=limit)
- exp = Series(_exp_vals, name='val')
- assert_series_equal(result, exp)
- else:
- result = getattr(df.groupby('key'), fill_method)(limit=limit)
- exp = DataFrame({'key': keys, 'val': _exp_vals})
- assert_frame_equal(result, exp)
-
- @pytest.mark.parametrize("test_series", [True, False])
- @pytest.mark.parametrize("periods,fill_method,limit", [
- (1, 'ffill', None), (1, 'ffill', 1),
- (1, 'bfill', None), (1, 'bfill', 1),
- (-1, 'ffill', None), (-1, 'ffill', 1),
- (-1, 'bfill', None), (-1, 'bfill', 1)])
- def test_pct_change(self, test_series, periods, fill_method, limit):
- vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
- exp_vals = Series(vals).pct_change(periods=periods,
- fill_method=fill_method,
- limit=limit).tolist()
-
- df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
- 'vals': vals * 2})
- grp = df.groupby('key')
-
- def get_result(grp_obj):
- return grp_obj.pct_change(periods=periods,
- fill_method=fill_method,
- limit=limit)
-
- if test_series:
- exp = pd.Series(exp_vals * 2)
- exp.name = 'vals'
- grp = grp['vals']
- result = get_result(grp)
- tm.assert_series_equal(result, exp)
- else:
- exp = DataFrame({'vals': exp_vals * 2})
- result = get_result(grp)
- tm.assert_frame_equal(result, exp)
-
- @pytest.mark.parametrize("func", [np.any, np.all])
- def test_any_all_np_func(self, func):
- # GH 20653
- df = pd.DataFrame([['foo', True],
- [np.nan, True],
- ['foo', True]], columns=['key', 'val'])
-
- exp = pd.Series([True, np.nan, True], name='val')
-
- res = df.groupby('key')['val'].transform(func)
- tm.assert_series_equal(res, exp)
+ expected = gb[c].apply(targop)
+ expected.name = c
+ tm.assert_series_equal(expected,
+ gb[c].transform(op, *args))
+ tm.assert_series_equal(expected,
+ getattr(gb[c], op)(*args))
+
+
+def test_transform_with_non_scalar_group():
+ # GH 10165
+ cols = pd.MultiIndex.from_tuples([
+ ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
+ ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
+ ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
+ ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
+ df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
+ columns=cols,
+ index=['A', 'C', 'G', 'T'])
+ tm.assert_raises_regex(ValueError, 'transform must return '
+ 'a scalar value for each '
+ 'group.*',
+ df.groupby(axis=1, level=1).transform,
+ lambda z: z.div(z.sum(axis=1), axis=0))
+
+
+@pytest.mark.parametrize('cols,exp,comp_func', [
+ ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
+ (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
+ tm.assert_frame_equal)
+])
+@pytest.mark.parametrize('agg_func', [
+ 'count', 'rank', 'size'])
+def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
+ if agg_func == 'size' and isinstance(cols, list):
+ pytest.xfail("'size' transformation not supported with "
+ "NDFrameGroupy")
+
+ # GH 19200
+ df = pd.DataFrame(
+ {'a': pd.date_range('2018-01-01', periods=3),
+ 'b': range(3),
+ 'c': range(7, 10)})
+
+ result = df.groupby('b')[cols].transform(agg_func)
+
+ if agg_func == 'rank':
+ exp = exp.astype('float')
+
+ comp_func(result, exp)
+
+
+@pytest.mark.parametrize("mix_groupings", [True, False])
+@pytest.mark.parametrize("as_series", [True, False])
+@pytest.mark.parametrize("val1,val2", [
+ ('foo', 'bar'), (1, 2), (1., 2.)])
+@pytest.mark.parametrize("fill_method,limit,exp_vals", [
+ ("ffill", None,
+ [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
+ ("ffill", 1,
+ [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
+ ("bfill", None,
+ ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
+ ("bfill", 1,
+ [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
+])
+def test_group_fill_methods(mix_groupings, as_series, val1, val2,
+ fill_method, limit, exp_vals):
+ vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
+ _exp_vals = list(exp_vals)
+ # Overwrite placeholder values
+ for index, exp_val in enumerate(_exp_vals):
+ if exp_val == 'val1':
+ _exp_vals[index] = val1
+ elif exp_val == 'val2':
+ _exp_vals[index] = val2
+
+ # Need to modify values and expectations depending on the
+ # Series / DataFrame that we ultimately want to generate
+    if mix_groupings:  # ['a', 'b', 'a', 'b', ...]
+ keys = ['a', 'b'] * len(vals)
+
+ def interweave(list_obj):
+ temp = list()
+ for x in list_obj:
+ temp.extend([x, x])
+
+ return temp
+
+ _exp_vals = interweave(_exp_vals)
+ vals = interweave(vals)
+ else: # ['a', 'a', 'a', ... 'b', 'b', 'b']
+ keys = ['a'] * len(vals) + ['b'] * len(vals)
+ _exp_vals = _exp_vals * 2
+ vals = vals * 2
+
+ df = DataFrame({'key': keys, 'val': vals})
+ if as_series:
+ result = getattr(
+ df.groupby('key')['val'], fill_method)(limit=limit)
+ exp = Series(_exp_vals, name='val')
+ assert_series_equal(result, exp)
+ else:
+ result = getattr(df.groupby('key'), fill_method)(limit=limit)
+ exp = DataFrame({'key': keys, 'val': _exp_vals})
+ assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("periods,fill_method,limit", [
+ (1, 'ffill', None), (1, 'ffill', 1),
+ (1, 'bfill', None), (1, 'bfill', 1),
+ (-1, 'ffill', None), (-1, 'ffill', 1),
+ (-1, 'bfill', None), (-1, 'bfill', 1)])
+def test_pct_change(test_series, periods, fill_method, limit):
+ vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
+ exp_vals = Series(vals).pct_change(periods=periods,
+ fill_method=fill_method,
+ limit=limit).tolist()
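+    # groupby pct_change should match plain Series.pct_change applied
+    # to each group's values independently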
+
+ df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
+ 'vals': vals * 2})
+ grp = df.groupby('key')
+
+ def get_result(grp_obj):
+ return grp_obj.pct_change(periods=periods,
+ fill_method=fill_method,
+ limit=limit)
+
+ if test_series:
+ exp = pd.Series(exp_vals * 2)
+ exp.name = 'vals'
+ grp = grp['vals']
+ result = get_result(grp)
+ tm.assert_series_equal(result, exp)
+ else:
+ exp = DataFrame({'vals': exp_vals * 2})
+ result = get_result(grp)
+ tm.assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize("func", [np.any, np.all])
+def test_any_all_np_func(func):
+ # GH 20653
+ df = pd.DataFrame([['foo', True],
+ [np.nan, True],
+ ['foo', True]], columns=['key', 'val'])
+
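+    # the NaN key is excluded from grouping, so its row stays NaN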
+ exp = pd.Series([True, np.nan, True], name='val')
+
+ res = df.groupby('key')['val'].transform(func)
+ tm.assert_series_equal(res, exp)
diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py
index 1cf854ad4a926..dae69a86910af 100644
--- a/pandas/tests/indexes/datetimes/test_construction.py
+++ b/pandas/tests/indexes/datetimes/test_construction.py
@@ -598,16 +598,16 @@ def test_datetimeindex_constructor_misc(self):
idx2 = DatetimeIndex(start=sdate, end=edate,
freq=offsets.Week(weekday=6))
assert len(idx1) == len(idx2)
- assert idx1.offset == idx2.offset
+ assert idx1.freq == idx2.freq
idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS')
idx2 = DatetimeIndex(start=sdate, end=edate,
freq=offsets.QuarterBegin(startingMonth=1))
assert len(idx1) == len(idx2)
- assert idx1.offset == idx2.offset
+ assert idx1.freq == idx2.freq
idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ')
idx2 = DatetimeIndex(start=sdate, end=edate,
freq=offsets.BQuarterEnd(startingMonth=12))
assert len(idx1) == len(idx2)
- assert idx1.offset == idx2.offset
+ assert idx1.freq == idx2.freq
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index d2ec465468dfb..e5291ed52a86c 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -236,6 +236,12 @@ def test_catch_infinite_loop(self):
pytest.raises(Exception, date_range, datetime(2011, 11, 11),
datetime(2011, 11, 12), freq=offset)
+ @pytest.mark.parametrize('periods', (1, 2))
+ def test_wom_len(self, periods):
+ # https://github.com/pandas-dev/pandas/issues/20517
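+        # 'WOM-1MON' means the first Monday of each month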
+ res = date_range(start='20110101', periods=periods, freq='WOM-1MON')
+ assert len(res) == periods
+
class TestGenRangeGeneration(object):
@@ -331,21 +337,21 @@ def test_naive_aware_conflicts(self):
aware.join(naive)
def test_cached_range(self):
- DatetimeIndex._cached_range(START, END, offset=BDay())
- DatetimeIndex._cached_range(START, periods=20, offset=BDay())
- DatetimeIndex._cached_range(end=START, periods=20, offset=BDay())
+ DatetimeIndex._cached_range(START, END, freq=BDay())
+ DatetimeIndex._cached_range(START, periods=20, freq=BDay())
+ DatetimeIndex._cached_range(end=START, periods=20, freq=BDay())
- with tm.assert_raises_regex(TypeError, "offset"):
+ with tm.assert_raises_regex(TypeError, "freq"):
DatetimeIndex._cached_range(START, END)
with tm.assert_raises_regex(TypeError, "specify period"):
- DatetimeIndex._cached_range(START, offset=BDay())
+ DatetimeIndex._cached_range(START, freq=BDay())
with tm.assert_raises_regex(TypeError, "specify period"):
- DatetimeIndex._cached_range(end=END, offset=BDay())
+ DatetimeIndex._cached_range(end=END, freq=BDay())
with tm.assert_raises_regex(TypeError, "start or end"):
- DatetimeIndex._cached_range(periods=20, offset=BDay())
+ DatetimeIndex._cached_range(periods=20, freq=BDay())
def test_cached_range_bug(self):
rng = date_range('2010-09-01 05:00:00', periods=50,
@@ -393,7 +399,7 @@ def test_daterange_bug_456(self):
# GH #456
rng1 = bdate_range('12/5/2011', '12/5/2011')
rng2 = bdate_range('12/2/2011', '12/5/2011')
- rng2.offset = BDay()
+ rng2.freq = BDay()
result = rng1.union(rng2)
assert isinstance(result, DatetimeIndex)
@@ -605,27 +611,27 @@ def test_constructor(self):
bdate_range('2011-1-1', '2012-1-1', 'C')
def test_cached_range(self):
- DatetimeIndex._cached_range(START, END, offset=CDay())
+ DatetimeIndex._cached_range(START, END, freq=CDay())
DatetimeIndex._cached_range(START, periods=20,
- offset=CDay())
+ freq=CDay())
DatetimeIndex._cached_range(end=START, periods=20,
- offset=CDay())
+ freq=CDay())
# with pytest.raises(TypeError):
- with tm.assert_raises_regex(TypeError, "offset"):
+ with tm.assert_raises_regex(TypeError, "freq"):
DatetimeIndex._cached_range(START, END)
# with pytest.raises(TypeError):
with tm.assert_raises_regex(TypeError, "specify period"):
- DatetimeIndex._cached_range(START, offset=CDay())
+ DatetimeIndex._cached_range(START, freq=CDay())
# with pytest.raises(TypeError):
with tm.assert_raises_regex(TypeError, "specify period"):
- DatetimeIndex._cached_range(end=END, offset=CDay())
+ DatetimeIndex._cached_range(end=END, freq=CDay())
# with pytest.raises(TypeError):
with tm.assert_raises_regex(TypeError, "start or end"):
- DatetimeIndex._cached_range(periods=20, offset=CDay())
+ DatetimeIndex._cached_range(periods=20, freq=CDay())
def test_misc(self):
end = datetime(2009, 5, 13)
@@ -640,7 +646,7 @@ def test_daterange_bug_456(self):
# GH #456
rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C')
rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C')
- rng2.offset = CDay()
+ rng2.freq = CDay()
result = rng1.union(rng2)
assert isinstance(result, DatetimeIndex)
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index af65a8618d30f..dd192db4b0eb3 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -53,10 +53,10 @@ def test_dti_business_getitem(self):
exp = DatetimeIndex(rng.view(np.ndarray)[:5])
tm.assert_index_equal(smaller, exp)
- assert smaller.offset == rng.offset
+ assert smaller.freq == rng.freq
sliced = rng[::5]
- assert sliced.offset == BDay() * 5
+ assert sliced.freq == BDay() * 5
fancy_indexed = rng[[4, 3, 2, 1, 0]]
assert len(fancy_indexed) == 5
@@ -77,10 +77,10 @@ def test_dti_custom_getitem(self):
smaller = rng[:5]
exp = DatetimeIndex(rng.view(np.ndarray)[:5])
tm.assert_index_equal(smaller, exp)
- assert smaller.offset == rng.offset
+ assert smaller.freq == rng.freq
sliced = rng[::5]
- assert sliced.offset == CDay() * 5
+ assert sliced.freq == CDay() * 5
fancy_indexed = rng[[4, 3, 2, 1, 0]]
assert len(fancy_indexed) == 5
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
index 8986828399a98..3c7d5d37e98f3 100644
--- a/pandas/tests/indexes/datetimes/test_ops.py
+++ b/pandas/tests/indexes/datetimes/test_ops.py
@@ -405,6 +405,18 @@ def test_equals(self):
assert not idx.equals(list(idx3))
assert not idx.equals(pd.Series(idx3))
+ def test_offset_deprecated(self):
+ # GH 20716
+ idx = pd.DatetimeIndex(['20180101', '20180102'])
+
+ # getter deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ idx.offset
+
+ # setter deprecated
+ with tm.assert_produces_warning(FutureWarning):
+ idx.offset = BDay()
+
class TestBusinessDatetimeIndex(object):
@@ -420,7 +432,7 @@ def test_comparison(self):
def test_pickle_unpickle(self):
unpickled = tm.round_trip_pickle(self.rng)
- assert unpickled.offset is not None
+ assert unpickled.freq is not None
def test_copy(self):
cp = self.rng.copy()
@@ -430,15 +442,15 @@ def test_copy(self):
def test_shift(self):
shifted = self.rng.shift(5)
assert shifted[0] == self.rng[5]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
shifted = self.rng.shift(-5)
assert shifted[5] == self.rng[0]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
shifted = self.rng.shift(0)
assert shifted[0] == self.rng[0]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
rng = date_range(START, END, freq=BMonthEnd())
shifted = rng.shift(1, freq=BDay())
@@ -485,15 +497,15 @@ def test_shift(self):
shifted = self.rng.shift(5)
assert shifted[0] == self.rng[5]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
shifted = self.rng.shift(-5)
assert shifted[5] == self.rng[0]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
shifted = self.rng.shift(0)
assert shifted[0] == self.rng[0]
- assert shifted.offset == self.rng.offset
+ assert shifted.freq == self.rng.freq
# PerformanceWarning
with warnings.catch_warnings(record=True):
@@ -503,7 +515,7 @@ def test_shift(self):
def test_pickle_unpickle(self):
unpickled = tm.round_trip_pickle(self.rng)
- assert unpickled.offset is not None
+ assert unpickled.freq is not None
def test_equals(self):
assert not self.rng.equals(list(self.rng))
diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py
index f263ac78cd343..4580d9fff31d5 100644
--- a/pandas/tests/indexes/datetimes/test_partial_slicing.py
+++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -91,6 +91,27 @@ def test_slice_duplicate_monotonic(self):
expected = Timestamp('2017-01-01')
assert result == expected
+ def test_monotone_DTI_indexing_bug(self):
+ # GH 19362
+        # Test accessing the first element of a monotonic decreasing
+        # index via partial string indexing.
+
+ df = pd.DataFrame(list(range(5)))
+ date_list = ['2018-01-02', '2017-02-10', '2016-03-10',
+ '2015-03-15', '2014-03-16']
+ date_index = pd.to_datetime(date_list)
+ df['date'] = date_index
+ expected = pd.DataFrame({0: list(range(5)), 'date': date_index})
+ tm.assert_frame_equal(df, expected)
+
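+        # partial string .loc on a monotonic decreasing index should
+        # still find the single matching row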
+ df = pd.DataFrame({'A': [1, 2, 3]},
+ index=pd.date_range('20170101',
+ periods=3)[::-1])
+ expected = pd.DataFrame({'A': 1},
+ index=pd.date_range('20170103',
+ periods=1))
+ tm.assert_frame_equal(df.loc['2017-01-03'], expected)
+
def test_slice_year(self):
dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500)
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 84632e59e2bfb..cb9364edc0cc3 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -357,7 +357,7 @@ def test_intersection(self):
expected = rng[10:25]
tm.assert_index_equal(the_int, expected)
assert isinstance(the_int, DatetimeIndex)
- assert the_int.offset == rng.offset
+ assert the_int.freq == rng.freq
the_int = rng1.intersection(rng2.view(DatetimeIndex))
tm.assert_index_equal(the_int, expected)
diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py
index c3926cc5f1633..daf44a559cf5c 100644
--- a/pandas/tests/indexes/period/test_formats.py
+++ b/pandas/tests/indexes/period/test_formats.py
@@ -13,7 +13,7 @@ def test_to_native_types():
# First, with no arguments.
expected = np.array(['2017-01-01', '2017-01-02',
-                         '2017-01-03'], dtype='<U10')
+                         '2017-01-03'], dtype='=U10')
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ ... @@
+    # not valid: alpha <= 0 or alpha > 1
+    for alpha in (-0.5, 1.5):
with pytest.raises(ValueError):
- c(halflife=0)
+ c(alpha=alpha)
- # not valid: alpha <= 0 or alpha > 1
- for alpha in (-0.5, 1.5):
- with pytest.raises(ValueError):
- c(alpha=alpha)
-
- def test_numpy_compat(self):
+ @pytest.mark.parametrize(
+ 'method', ['std', 'mean', 'var'])
+ def test_numpy_compat(self, method):
# see gh-12811
e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5)
msg = "numpy operations are not valid with window objects"
- for func in ('std', 'mean', 'var'):
- tm.assert_raises_regex(UnsupportedFunctionCall, msg,
- getattr(e, func), 1, 2, 3)
- tm.assert_raises_regex(UnsupportedFunctionCall, msg,
- getattr(e, func), dtype=np.float64)
+ tm.assert_raises_regex(UnsupportedFunctionCall, msg,
+ getattr(e, method), 1, 2, 3)
+ tm.assert_raises_regex(UnsupportedFunctionCall, msg,
+ getattr(e, method), dtype=np.float64)
# gh-12373 : rolling functions error on float32 data
@@ -943,11 +987,8 @@ def test_cmov_window_na_min_periods(self):
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
- def test_cmov_window_regular(self):
+ def test_cmov_window_regular(self, win_types):
# GH 8238
- win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
- 'blackmanharris', 'nuttall', 'barthann']
-
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
xps = {
@@ -969,33 +1010,25 @@ def test_cmov_window_regular(self):
14.0825, 11.5675, np.nan, np.nan]
}
- for wt in win_types:
- xp = Series(xps[wt])
- rs = Series(vals).rolling(5, win_type=wt, center=True).mean()
- tm.assert_series_equal(xp, rs)
+ xp = Series(xps[win_types])
+ rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
+ tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
- def test_cmov_window_regular_linear_range(self):
+ def test_cmov_window_regular_linear_range(self, win_types):
# GH 8238
- win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
- 'blackmanharris', 'nuttall', 'barthann']
-
vals = np.array(range(10), dtype=np.float)
xp = vals.copy()
xp[:2] = np.nan
xp[-2:] = np.nan
xp = Series(xp)
- for wt in win_types:
- rs = Series(vals).rolling(5, win_type=wt, center=True).mean()
- tm.assert_series_equal(xp, rs)
+ rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
+ tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
- def test_cmov_window_regular_missing_data(self):
+ def test_cmov_window_regular_missing_data(self, win_types):
# GH 8238
- win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
- 'blackmanharris', 'nuttall', 'barthann']
-
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan,
10.63, 14.48])
xps = {
@@ -1017,17 +1050,18 @@ def test_cmov_window_regular_missing_data(self):
9.16438, 13.05052, 14.02175, 16.1098, 13.65509]
}
- for wt in win_types:
- xp = Series(xps[wt])
- rs = Series(vals).rolling(5, win_type=wt, min_periods=3).mean()
- tm.assert_series_equal(xp, rs)
+ xp = Series(xps[win_types])
+ rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean()
+ tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
- def test_cmov_window_special(self):
+ def test_cmov_window_special(self, win_types_special):
# GH 8238
- win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian']
- kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2.,
- 'width': 2.}, {'width': 0.5}]
+ kwds = {
+ 'kaiser': {'beta': 1.},
+ 'gaussian': {'std': 1.},
+ 'general_gaussian': {'power': 2., 'width': 2.},
+ 'slepian': {'width': 0.5}}
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
@@ -1043,17 +1077,20 @@ def test_cmov_window_special(self):
12.90702, 12.83757, np.nan, np.nan]
}
- for wt, k in zip(win_types, kwds):
- xp = Series(xps[wt])
- rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k)
- tm.assert_series_equal(xp, rs)
+ xp = Series(xps[win_types_special])
+ rs = Series(vals).rolling(
+ 5, win_type=win_types_special, center=True).mean(
+ **kwds[win_types_special])
+ tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
- def test_cmov_window_special_linear_range(self):
+ def test_cmov_window_special_linear_range(self, win_types_special):
# GH 8238
- win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian']
- kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2.,
- 'width': 2.}, {'width': 0.5}]
+ kwds = {
+ 'kaiser': {'beta': 1.},
+ 'gaussian': {'std': 1.},
+ 'general_gaussian': {'power': 2., 'width': 2.},
+ 'slepian': {'width': 0.5}}
vals = np.array(range(10), dtype=np.float)
xp = vals.copy()
@@ -1061,9 +1098,10 @@ def test_cmov_window_special_linear_range(self):
xp[-2:] = np.nan
xp = Series(xp)
- for wt, k in zip(win_types, kwds):
- rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k)
- tm.assert_series_equal(xp, rs)
+ rs = Series(vals).rolling(
+ 5, win_type=win_types_special, center=True).mean(
+ **kwds[win_types_special])
+ tm.assert_series_equal(xp, rs)
def test_rolling_median(self):
self._check_moment_func(np.median, name='median')
@@ -1150,43 +1188,76 @@ def test_rolling_quantile_param(self):
with pytest.raises(TypeError):
ser.rolling(3).quantile('foo')
- def test_rolling_apply(self):
+ def test_rolling_apply(self, raw):
# suppress warnings about empty slices, as we are deliberately testing
# with a 0-length Series
+
with warnings.catch_warnings():
warnings.filterwarnings("ignore",
message=".*(empty slice|0 for slice).*",
category=RuntimeWarning)
- ser = Series([])
- tm.assert_series_equal(ser,
- ser.rolling(10).apply(lambda x: x.mean()))
-
def f(x):
return x[np.isfinite(x)].mean()
- self._check_moment_func(np.mean, name='apply', func=f)
+ self._check_moment_func(np.mean, name='apply', func=f, raw=raw)
- # GH 8080
+ expected = Series([])
+ result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw)
+ tm.assert_series_equal(result, expected)
+
+ # gh-8080
s = Series([None, None, None])
- result = s.rolling(2, min_periods=0).apply(lambda x: len(x))
+ result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw)
expected = Series([1., 2., 2.])
tm.assert_series_equal(result, expected)
- result = s.rolling(2, min_periods=0).apply(len)
+ result = s.rolling(2, min_periods=0).apply(len, raw=raw)
tm.assert_series_equal(result, expected)
- def test_rolling_apply_out_of_bounds(self):
- # #1850
+ @pytest.mark.parametrize('klass', [Series, DataFrame])
+ @pytest.mark.parametrize(
+ 'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()])
+ def test_apply_future_warning(self, klass, method):
+
+ # gh-5071
+ s = klass(np.arange(3))
+
+ with tm.assert_produces_warning(FutureWarning):
+ method(s).apply(lambda x: len(x))
+
+ def test_rolling_apply_out_of_bounds(self, raw):
+ # gh-1850
vals = pd.Series([1, 2, 3, 4])
- result = vals.rolling(10).apply(np.sum)
+ result = vals.rolling(10).apply(np.sum, raw=raw)
assert result.isna().all()
- result = vals.rolling(10, min_periods=1).apply(np.sum)
+ result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw)
expected = pd.Series([1, 3, 6, 10], dtype=float)
tm.assert_almost_equal(result, expected)
+ @pytest.mark.parametrize('window', [2, '2s'])
+ def test_rolling_apply_with_pandas_objects(self, window):
+ # gh-5071
+ df = pd.DataFrame({'A': np.random.randn(5),
+ 'B': np.random.randint(0, 10, size=5)},
+ index=pd.date_range('20130101', periods=5, freq='s'))
+
+ # we have an equally spaced timeseries index
+ # so simulate removing the first period
+ def f(x):
+ if x.index[0] == df.index[0]:
+ return np.nan
+ return x.iloc[-1]
+
+ result = df.rolling(window).apply(f, raw=False)
+ expected = df.iloc[2:].reindex_like(df)
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(AttributeError):
+ df.rolling(window).apply(f, raw=True)
+
def test_rolling_std(self):
self._check_moment_func(lambda x: np.std(x, ddof=1),
name='std')
@@ -1256,10 +1327,10 @@ def get_result(obj, window, min_periods=None, center=False):
frame_result = get_result(self.frame, window=50)
assert isinstance(frame_result, DataFrame)
- tm.assert_series_equal(frame_result.iloc[-1, :],
- self.frame.iloc[-50:, :].apply(static_comp,
- axis=0),
- check_names=False)
+ tm.assert_series_equal(
+ frame_result.iloc[-1, :],
+ self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw),
+ check_names=False)
# check time_rule works
if has_time_rule:
@@ -1287,7 +1358,7 @@ def get_result(obj, window, min_periods=None, center=False):
static_comp(trunc_series))
tm.assert_series_equal(frame_result.xs(last_date),
- trunc_frame.apply(static_comp),
+ trunc_frame.apply(static_comp, raw=raw),
check_names=False)
# excluding NaNs correctly
@@ -1402,26 +1473,20 @@ def test_ewma(self):
result = vals.ewm(span=100, adjust=False).mean().sum()
assert np.abs(result - 1) < 1e-2
+ @pytest.mark.parametrize('adjust', [True, False])
+ @pytest.mark.parametrize('ignore_na', [True, False])
+ def test_ewma_cases(self, adjust, ignore_na):
+ # try the adjust/ignore_na args matrix
+
s = Series([1.0, 2.0, 4.0, 8.0])
- expected = Series([1.0, 1.6, 2.736842, 4.923077])
- for f in [lambda s: s.ewm(com=2.0, adjust=True).mean(),
- lambda s: s.ewm(com=2.0, adjust=True,
- ignore_na=False).mean(),
- lambda s: s.ewm(com=2.0, adjust=True, ignore_na=True).mean(),
- ]:
- result = f(s)
- tm.assert_series_equal(result, expected)
+ if adjust:
+ expected = Series([1.0, 1.6, 2.736842, 4.923077])
+ else:
+ expected = Series([1.0, 1.333333, 2.222222, 4.148148])
- expected = Series([1.0, 1.333333, 2.222222, 4.148148])
- for f in [lambda s: s.ewm(com=2.0, adjust=False).mean(),
- lambda s: s.ewm(com=2.0, adjust=False,
- ignore_na=False).mean(),
- lambda s: s.ewm(com=2.0, adjust=False,
- ignore_na=True).mean(),
- ]:
- result = f(s)
- tm.assert_series_equal(result, expected)
+ result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean()
+ tm.assert_series_equal(result, expected)
def test_ewma_nan_handling(self):
s = Series([1.] + [np.nan] * 5 + [1.])
@@ -1555,14 +1620,13 @@ def test_ewm_domain_checks(self):
s.ewm(alpha=1.0)
pytest.raises(ValueError, s.ewm, alpha=1.1)
- def test_ew_empty_series(self):
+ @pytest.mark.parametrize('method', ['mean', 'vol', 'var'])
+ def test_ew_empty_series(self, method):
vals = pd.Series([], dtype=np.float64)
ewm = vals.ewm(3)
- funcs = ['mean', 'vol', 'var']
- for f in funcs:
- result = getattr(ewm, f)()
- tm.assert_almost_equal(result, vals)
+ result = getattr(ewm, method)()
+ tm.assert_almost_equal(result, vals)
def _check_ew(self, name=None, preserve_nan=False):
series_result = getattr(self.series.ewm(com=10), name)()
@@ -2160,7 +2224,7 @@ def test_expanding_consistency(self, min_periods):
if name == 'count':
expanding_f_result = expanding_f()
expanding_apply_f_result = x.expanding(
- min_periods=0).apply(func=f)
+ min_periods=0).apply(func=f, raw=True)
else:
if name in ['cov', 'corr']:
expanding_f_result = expanding_f(
@@ -2168,7 +2232,7 @@ def test_expanding_consistency(self, min_periods):
else:
expanding_f_result = expanding_f()
expanding_apply_f_result = x.expanding(
- min_periods=min_periods).apply(func=f)
+ min_periods=min_periods).apply(func=f, raw=True)
# GH 9422
if name in ['sum', 'prod']:
@@ -2259,7 +2323,7 @@ def test_rolling_consistency(self, window, min_periods, center):
rolling_f_result = rolling_f()
rolling_apply_f_result = x.rolling(
window=window, min_periods=0,
- center=center).apply(func=f)
+ center=center).apply(func=f, raw=True)
else:
if name in ['cov', 'corr']:
rolling_f_result = rolling_f(
@@ -2268,7 +2332,7 @@ def test_rolling_consistency(self, window, min_periods, center):
rolling_f_result = rolling_f()
rolling_apply_f_result = x.rolling(
window=window, min_periods=min_periods,
- center=center).apply(func=f)
+ center=center).apply(func=f, raw=True)
# GH 9422
if name in ['sum', 'prod']:
@@ -2348,29 +2412,25 @@ def test_corr_sanity(self):
except AssertionError:
print(res)
- def test_flex_binary_frame(self):
- def _check(method):
- series = self.frame[1]
+ @pytest.mark.parametrize('method', ['corr', 'cov'])
+ def test_flex_binary_frame(self, method):
+ series = self.frame[1]
- res = getattr(series.rolling(window=10), method)(self.frame)
- res2 = getattr(self.frame.rolling(window=10), method)(series)
- exp = self.frame.apply(lambda x: getattr(
- series.rolling(window=10), method)(x))
+ res = getattr(series.rolling(window=10), method)(self.frame)
+ res2 = getattr(self.frame.rolling(window=10), method)(series)
+ exp = self.frame.apply(lambda x: getattr(
+ series.rolling(window=10), method)(x))
- tm.assert_frame_equal(res, exp)
- tm.assert_frame_equal(res2, exp)
+ tm.assert_frame_equal(res, exp)
+ tm.assert_frame_equal(res2, exp)
- frame2 = self.frame.copy()
- frame2.values[:] = np.random.randn(*frame2.shape)
+ frame2 = self.frame.copy()
+ frame2.values[:] = np.random.randn(*frame2.shape)
- res3 = getattr(self.frame.rolling(window=10), method)(frame2)
- exp = DataFrame(dict((k, getattr(self.frame[k].rolling(
- window=10), method)(frame2[k])) for k in self.frame))
- tm.assert_frame_equal(res3, exp)
-
- methods = ['corr', 'cov']
- for meth in methods:
- _check(meth)
+ res3 = getattr(self.frame.rolling(window=10), method)(frame2)
+ exp = DataFrame(dict((k, getattr(self.frame[k].rolling(
+ window=10), method)(frame2[k])) for k in self.frame))
+ tm.assert_frame_equal(res3, exp)
def test_ewmcov(self):
self._check_binary_ew('cov')
@@ -2417,19 +2477,24 @@ def func(A, B, com, **kwargs):
pytest.raises(Exception, func, A, randn(50), 20, min_periods=5)
- def test_expanding_apply_args_kwargs(self):
+ def test_expanding_apply_args_kwargs(self, raw):
+
def mean_w_arg(x, const):
return np.mean(x) + const
df = DataFrame(np.random.rand(20, 3))
- expected = df.expanding().apply(np.mean) + 20.
+ expected = df.expanding().apply(np.mean, raw=raw) + 20.
- tm.assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )),
- expected)
- tm.assert_frame_equal(df.expanding().apply(mean_w_arg,
- kwargs={'const': 20}),
- expected)
+ result = df.expanding().apply(mean_w_arg,
+ raw=raw,
+ args=(20, ))
+ tm.assert_frame_equal(result, expected)
+
+ result = df.expanding().apply(mean_w_arg,
+ raw=raw,
+ kwargs={'const': 20})
+ tm.assert_frame_equal(result, expected)
def test_expanding_corr(self):
A = self.series.dropna()
@@ -2539,42 +2604,47 @@ def test_rolling_corr_diff_length(self):
result = s1.rolling(window=3, min_periods=2).corr(s2a)
tm.assert_series_equal(result, expected)
- def test_rolling_functions_window_non_shrinkage(self):
+ @pytest.mark.parametrize(
+ 'f',
+ [
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .cov(x, pairwise=False)),
+ lambda x: (x.rolling(window=10, min_periods=5)
+ .corr(x, pairwise=False)),
+ lambda x: x.rolling(window=10, min_periods=5).max(),
+ lambda x: x.rolling(window=10, min_periods=5).min(),
+ lambda x: x.rolling(window=10, min_periods=5).sum(),
+ lambda x: x.rolling(window=10, min_periods=5).mean(),
+ lambda x: x.rolling(window=10, min_periods=5).std(),
+ lambda x: x.rolling(window=10, min_periods=5).var(),
+ lambda x: x.rolling(window=10, min_periods=5).skew(),
+ lambda x: x.rolling(window=10, min_periods=5).kurt(),
+ lambda x: x.rolling(
+ window=10, min_periods=5).quantile(quantile=0.5),
+ lambda x: x.rolling(window=10, min_periods=5).median(),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=True),
+ lambda x: x.rolling(win_type='boxcar',
+ window=10, min_periods=5).mean()])
+ def test_rolling_functions_window_non_shrinkage(self, f):
# GH 7764
s = Series(range(4))
s_expected = Series(np.nan, index=s.index)
df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
- functions = [lambda x: (x.rolling(window=10, min_periods=5)
- .cov(x, pairwise=False)),
- lambda x: (x.rolling(window=10, min_periods=5)
- .corr(x, pairwise=False)),
- lambda x: x.rolling(window=10, min_periods=5).max(),
- lambda x: x.rolling(window=10, min_periods=5).min(),
- lambda x: x.rolling(window=10, min_periods=5).sum(),
- lambda x: x.rolling(window=10, min_periods=5).mean(),
- lambda x: x.rolling(window=10, min_periods=5).std(),
- lambda x: x.rolling(window=10, min_periods=5).var(),
- lambda x: x.rolling(window=10, min_periods=5).skew(),
- lambda x: x.rolling(window=10, min_periods=5).kurt(),
- lambda x: x.rolling(
- window=10, min_periods=5).quantile(quantile=0.5),
- lambda x: x.rolling(window=10, min_periods=5).median(),
- lambda x: x.rolling(window=10, min_periods=5).apply(sum),
- lambda x: x.rolling(win_type='boxcar',
- window=10, min_periods=5).mean()]
- for f in functions:
- try:
- s_result = f(s)
- tm.assert_series_equal(s_result, s_expected)
+ try:
+ s_result = f(s)
+ tm.assert_series_equal(s_result, s_expected)
- df_result = f(df)
- tm.assert_frame_equal(df_result, df_expected)
- except (ImportError):
+ df_result = f(df)
+ tm.assert_frame_equal(df_result, df_expected)
+ except (ImportError):
- # scipy needed for rolling_window
- continue
+ # scipy needed for rolling_window
+ pytest.skip("scipy not available")
def test_rolling_functions_window_non_shrinkage_binary(self):
@@ -2620,7 +2690,10 @@ def test_moment_functions_zero_length(self):
lambda x: x.expanding(min_periods=5).kurt(),
lambda x: x.expanding(min_periods=5).quantile(0.5),
lambda x: x.expanding(min_periods=5).median(),
- lambda x: x.expanding(min_periods=5).apply(sum),
+ lambda x: x.expanding(min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.expanding(min_periods=5).apply(
+ sum, raw=True),
lambda x: x.rolling(window=10).count(),
lambda x: x.rolling(window=10, min_periods=5).cov(
x, pairwise=False),
@@ -2637,7 +2710,10 @@ def test_moment_functions_zero_length(self):
lambda x: x.rolling(
window=10, min_periods=5).quantile(0.5),
lambda x: x.rolling(window=10, min_periods=5).median(),
- lambda x: x.rolling(window=10, min_periods=5).apply(sum),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=False),
+ lambda x: x.rolling(window=10, min_periods=5).apply(
+ sum, raw=True),
lambda x: x.rolling(win_type='boxcar',
window=10, min_periods=5).mean(),
]
@@ -2805,20 +2881,25 @@ def expanding_func(x, min_periods=1, center=False, axis=0):
return getattr(exp, func)()
self._check_expanding(expanding_func, static_comp, preserve_nan=False)
- def test_expanding_apply(self):
+ def test_expanding_apply(self, raw):
def expanding_mean(x, min_periods=1):
+
exp = x.expanding(min_periods=min_periods)
- return exp.apply(lambda x: x.mean())
+ result = exp.apply(lambda x: x.mean(), raw=raw)
+ return result
- self._check_expanding(expanding_mean, np.mean)
+ # TODO(jreback): needed to add preserve_nan=False
+ # here to make this pass
+ self._check_expanding(expanding_mean, np.mean, preserve_nan=False)
ser = Series([])
- tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean()))
+ tm.assert_series_equal(ser, ser.expanding().apply(
+ lambda x: x.mean(), raw=raw))
# GH 8080
s = Series([None, None, None])
- result = s.expanding(min_periods=0).apply(lambda x: len(x))
+ result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw)
expected = Series([1., 2., 3.])
tm.assert_series_equal(result, expected)
@@ -3057,13 +3138,14 @@ def func(x):
expected = g.apply(func)
tm.assert_series_equal(result, expected)
- def test_rolling_apply(self):
+ def test_rolling_apply(self, raw):
g = self.frame.groupby('A')
r = g.rolling(window=4)
# reduction
- result = r.apply(lambda x: x.sum())
- expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum()))
+ result = r.apply(lambda x: x.sum(), raw=raw)
+ expected = g.apply(
+ lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
tm.assert_frame_equal(result, expected)
def test_expanding(self):
@@ -3104,13 +3186,14 @@ def func(x):
expected = g.apply(func)
tm.assert_series_equal(result, expected)
- def test_expanding_apply(self):
+ def test_expanding_apply(self, raw):
g = self.frame.groupby('A')
r = g.expanding()
# reduction
- result = r.apply(lambda x: x.sum())
- expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum()))
+ result = r.apply(lambda x: x.sum(), raw=raw)
+ expected = g.apply(
+ lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
tm.assert_frame_equal(result, expected)
@@ -3624,22 +3707,22 @@ def test_ragged_max(self):
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
- def test_ragged_apply(self):
+ def test_ragged_apply(self, raw):
df = self.ragged
f = lambda x: 1
- result = df.rolling(window='1s', min_periods=1).apply(f)
+ result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
- result = df.rolling(window='2s', min_periods=1).apply(f)
+ result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
- result = df.rolling(window='5s', min_periods=1).apply(f)
+ result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
@@ -3662,8 +3745,14 @@ def test_all(self):
expected = er.quantile(0.5)
tm.assert_frame_equal(result, expected)
- result = r.apply(lambda x: 1)
- expected = er.apply(lambda x: 1)
+ def test_all_apply(self, raw):
+
+ df = self.regular * 2
+ er = df.rolling(window=1)
+ r = df.rolling(window='1s')
+
+ result = r.apply(lambda x: 1, raw=raw)
+ expected = er.apply(lambda x: 1, raw=raw)
tm.assert_frame_equal(result, expected)
def test_all2(self):
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index d96ebab615d12..5369b1a94a956 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -2228,8 +2228,6 @@ class TestWeekOfMonth(Base):
_offset = WeekOfMonth
def test_constructor(self):
- tm.assert_raises_regex(ValueError, "^N cannot be 0",
- WeekOfMonth, n=0, week=1, weekday=1)
tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth,
n=1, week=4, weekday=0)
tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth,
@@ -2261,6 +2259,19 @@ def test_offset(self):
(-1, 2, 1, date3, datetime(2010, 12, 21)),
(-1, 2, 1, date4, datetime(2011, 1, 18)),
+ (0, 0, 1, date1, datetime(2011, 1, 4)),
+ (0, 0, 1, date2, datetime(2011, 2, 1)),
+ (0, 0, 1, date3, datetime(2011, 2, 1)),
+ (0, 0, 1, date4, datetime(2011, 2, 1)),
+ (0, 1, 1, date1, datetime(2011, 1, 11)),
+ (0, 1, 1, date2, datetime(2011, 1, 11)),
+ (0, 1, 1, date3, datetime(2011, 2, 8)),
+ (0, 1, 1, date4, datetime(2011, 2, 8)),
+ (0, 0, 1, date1, datetime(2011, 1, 4)),
+ (0, 1, 1, date2, datetime(2011, 1, 11)),
+ (0, 2, 1, date3, datetime(2011, 1, 18)),
+ (0, 3, 1, date4, datetime(2011, 1, 25)),
+
(1, 0, 0, date1, datetime(2011, 2, 7)),
(1, 0, 0, date2, datetime(2011, 2, 7)),
(1, 0, 0, date3, datetime(2011, 2, 7)),
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index 2e4be7fbdeebf..749165f894819 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -1461,9 +1461,6 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0):
self.weekday = weekday
self.week = week
- if self.n == 0:
- raise ValueError('N cannot be 0')
-
if self.weekday < 0 or self.weekday > 6:
raise ValueError('Day must be 0<=day<=6, got {day}'
.format(day=self.weekday))
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index f0d57d1d808a2..0000000000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-[build-system]
-requires = [
- "wheel",
- "setuptools",
- "Cython", # required for VCS build, optional for released source
- "numpy==1.9.3; python_version=='3.5'",
- "numpy==1.12.1; python_version=='3.6'",
- "numpy==1.13.1; python_version>='3.7'",
-]
diff --git a/setup.py b/setup.py
index 7fb5358d0950b..973b4c0abcde2 100755
--- a/setup.py
+++ b/setup.py
@@ -748,4 +748,5 @@ def pxd(name):
long_description=LONG_DESCRIPTION,
classifiers=CLASSIFIERS,
platforms='any',
+ python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*',
**setuptools_kwargs)