From a6351400e4be77914f7f280030708ac6c6a9ea4d Mon Sep 17 00:00:00 2001
From: Anh Le
Date: Mon, 16 Apr 2018 01:14:23 -0400
Subject: [PATCH 01/33] ENH GH20601 raise an error when the number of levels
 in a pivot table is larger than int32

---
 pandas/core/reshape/reshape.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 389f1af48434a..13cb0cc08ca74 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -162,6 +162,8 @@ def _make_selectors(self):
         self.full_shape = ngroups, stride
 
         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
+        if np.prod(self.full_shape) > (2 ** 31 - 1):
+            raise ValueError('Pivot table is too big, causing int32 overflow')
         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
         mask.put(selector, True)

From ac224f5dd2280ca72cfa414a7e5893558db49886 Mon Sep 17 00:00:00 2001
From: Anh Le
Date: Mon, 16 Apr 2018 01:53:06 -0400
Subject: [PATCH 02/33] TST add a test for pivot table with a large number of
 levels causing int32 overflow

---
 pandas/tests/reshape/test_pivot.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 92bedbabdf2f1..3d74ef0685213 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1237,6 +1237,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
                                    aggfunc=f_numpy)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_pivot_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        data = DataFrame({'ind1': list(range(1337600)) * 2,
+                          'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600})
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count')
+
 
 class TestCrosstab(object):

From acbc4eb14e781375889dcbde943b892da69a8d55 Mon Sep 17 00:00:00 2001
From: Anh Le
Date: Mon, 16 Apr 2018 01:55:56 -0400
Subject: [PATCH 03/33] CLN PEP8 compliance

---
 pandas/tests/reshape/test_pivot.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 3d74ef0685213..cfea1cd10fe79 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1241,9 +1241,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
     def test_pivot_number_of_levels_larger_than_int32(self):
         # GH 20601
         data = DataFrame({'ind1': list(range(1337600)) * 2,
-                          'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600})
+                          'ind2': list(range(3040)) * 2 * 440,
+                          'count': [1] * 2 * 1337600})
         with tm.assert_raises_regex(ValueError, 'int32 overflow'):
-            data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count')
+            data.pivot_table(index='ind1', columns='ind2',
+                             values='count', aggfunc='count')
 
 
 class TestCrosstab(object):

From 662ce5f9752438d93297742cb7cb4cb591223a2d Mon Sep 17 00:00:00 2001
From: Anh Le
Date: Mon, 16 Apr 2018 02:07:37 -0400
Subject: [PATCH 04/33] DOC add whatsnew entry

---
 doc/source/whatsnew/v0.23.0.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 18c4dca5b69da..6044f2efc6f0e 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1177,6 +1177,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug
in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) +- Improved error message when the number of levels in a pivot table is too large causing int32 overflow (:issue:`20601`) Other ^^^^^ From 804101c8888c7c9cf33a5224489a4c75c4118fe1 Mon Sep 17 00:00:00 2001 From: Stefano Cianciulli Date: Mon, 16 Apr 2018 11:30:06 +0100 Subject: [PATCH 05/33] Fix issue 17912 (#20705) HDFStore.select_column error reporting --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/pytables.py | 11 ++++++----- pandas/tests/io/test_pytables.py | 9 ++++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 18c4dca5b69da..1e094db7d7b64 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1114,6 +1114,7 @@ I/O - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in ``usecols`` parameter in :func:`read_csv` where error is not raised correctly when passing a string. (:issue:`20529`) - Bug in :func:`HDFStore.keys` when reading a file with a softlink causes exception (:issue:`20523`) +- Bug in :func:`HDFStore.select_column` where a key which is not a valid store raised an ``AttributeError`` instead of a ``KeyError`` (:issue:`17912`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f9a496edb45a3..4004a6ea8f6ff 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -887,7 +887,10 @@ def remove(self, key, where=None, start=None, stop=None): where = _ensure_term(where, scope_level=1) try: s = self.get_storer(key) - except: + except KeyError: + # the key is not a valid store, re-raising KeyError + raise + except Exception: if where is not None: raise ValueError( @@ -899,9 +902,6 @@ def remove(self, key, where=None, start=None, stop=None): s._f_remove(recursive=True) return None - if s is None: - raise KeyError('No object named %s in the file' % key) - # remove the node if com._all_none(where, start, stop): s.group._f_remove(recursive=True) @@ -1094,7 +1094,8 @@ def get_storer(self, key): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - return None + raise KeyError('No object named {} in the file'.format(key)) + s = self._create_storer(group) s.infer_axes() return s diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index b34723d6cf72c..a6a38e005b9b6 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -3836,8 +3836,15 @@ def test_read_column(self): with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df') - store.append('df', df) + # GH 17912 + # HDFStore.select_column should raise a KeyError + # exception if the key is not a valid store + with pytest.raises(KeyError, + message='No object named index in the file'): + store.select_column('df', 'index') + + store.append('df', df) # error pytest.raises(KeyError, store.select_column, 'df', 'foo') From 1e4e04bf47417aadaf11c7d55c206508f2899fa5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Apr 2018 05:33:14 -0500 Subject: [PATCH 06/33] ENH: ExtensionArray.setitem (#19907) --- pandas/core/frame.py | 4 + pandas/core/indexing.py | 43 +++++ pandas/core/internals.py | 76 +++++--- pandas/tests/extension/base/__init__.py | 1 + 
pandas/tests/extension/base/setitem.py | 167 ++++++++++++++++++ .../extension/category/test_categorical.py | 4 + pandas/tests/extension/decimal/array.py | 5 +- pandas/tests/extension/json/test_json.py | 4 + 8 files changed, 280 insertions(+), 24 deletions(-) create mode 100644 pandas/tests/extension/base/setitem.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0e60f16891a52..f476bff4df2cd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3331,7 +3331,11 @@ def reindexer(value): value = reindexer(value).T elif isinstance(value, ExtensionArray): + from pandas.core.series import _sanitize_index + # Explicitly copy here, instead of in _sanitize_index, + # as sanitize_index won't copy an EA, even with copy=True value = value.copy() + value = _sanitize_index(value, self.index, copy=False) elif isinstance(value, Index) or is_sequence(value): from pandas.core.series import _sanitize_index diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9736bba78ab72..5240a4703c242 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2310,6 +2310,49 @@ def check_bool_indexer(ax, key): return result +def check_setitem_lengths(indexer, value, values): + """Validate that value and indexer are the same length. + + An special-case is allowed for when the indexer is a boolean array + and the number of true values equals the length of ``value``. In + this case, no exception is raised. + + Parameters + ---------- + indexer : sequence + The key for the setitem + value : array-like + The value for the setitem + values : array-like + The values being set into + + Returns + ------- + None + + Raises + ------ + ValueError + When the indexer is an ndarray or list and the lengths don't + match. + """ + # boolean with truth values == len of the value is ok too + if isinstance(indexer, (np.ndarray, list)): + if is_list_like(value) and len(indexer) != len(value): + if not (isinstance(indexer, np.ndarray) and + indexer.dtype == np.bool_ and + len(indexer[indexer]) == len(value)): + raise ValueError("cannot set using a list-like indexer " + "with a different length than the value") + # slice + elif isinstance(indexer, slice): + + if is_list_like(value) and len(values): + if len(value) != length_of_indexer(indexer, values): + raise ValueError("cannot set using a slice indexer with a " + "different length than the value") + + def convert_missing_indexer(indexer): """ reverse convert a missing indexer, which is a dict return the scalar indexer and a boolean indicating if we converted diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e8fab3748bacf..37d11296400be 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -66,7 +66,7 @@ import pandas.core.algorithms as algos from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import maybe_convert_indices, length_of_indexer +from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths from pandas.core.arrays import Categorical from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -817,11 +817,24 @@ def _replace_single(self, *args, **kwargs): return self if kwargs['inplace'] else self.copy() def setitem(self, indexer, value, mgr=None): - """ set the value inplace; return a new block (of a possibly different - dtype) + """Set the value inplace, returning a a maybe different typed block. 
- indexer is a direct slice/positional indexer; value must be a - compatible shape + Parameters + ---------- + indexer : tuple, list-like, array-like, slice + The subset of self.values to set + value : object + The value being set + mgr : BlockPlacement, optional + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. """ # coerce None values, if appropriate if value is None: @@ -876,22 +889,7 @@ def setitem(self, indexer, value, mgr=None): values = transf(values) # length checking - # boolean with truth values == len of the value is ok too - if isinstance(indexer, (np.ndarray, list)): - if is_list_like(value) and len(indexer) != len(value): - if not (isinstance(indexer, np.ndarray) and - indexer.dtype == np.bool_ and - len(indexer[indexer]) == len(value)): - raise ValueError("cannot set using a list-like indexer " - "with a different length than the value") - - # slice - elif isinstance(indexer, slice): - - if is_list_like(value) and len(values): - if len(value) != length_of_indexer(indexer, values): - raise ValueError("cannot set using a slice indexer with a " - "different length than the value") + check_setitem_lengths(indexer, value, values) def _is_scalar_indexer(indexer): # return True if we are all scalar indexers @@ -1900,6 +1898,37 @@ def is_view(self): """Extension arrays are never treated as views.""" return False + def setitem(self, indexer, value, mgr=None): + """Set the value inplace, returning a same-typed block. + + This differs from Block.setitem by not allowing setitem to change + the dtype of the Block. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice + The subset of self.values to set + value : object + The value being set + mgr : BlockPlacement, optional + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. + """ + if isinstance(indexer, tuple): + # we are always 1-D + indexer = indexer[0] + + check_setitem_lengths(indexer, value, self.values) + self.values[indexer] = value + return self + def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. values = np.asarray(self.values) @@ -3519,7 +3548,8 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, # with a .values attribute. aligned_args = dict((k, kwargs[k]) for k in align_keys - if hasattr(kwargs[k], 'values')) + if hasattr(kwargs[k], 'values') and + not isinstance(kwargs[k], ABCExtensionArray)) for b in self.blocks: if filter is not None: @@ -5220,7 +5250,7 @@ def _safe_reshape(arr, new_shape): If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): - 1) If `arr` is a Categorical or Index, `arr` will be + 1) If `arr` is a ExtensionArray or Index, `arr` will be returned as is. 2) If `arr` is a Series, the `_values` attribute will be reshaped and returned. 
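For context, a minimal sketch of the two validation paths that the `check_setitem_lengths` helper factored out above covers, using a plain ndarray-backed Series; the boolean-mask carve-out is the special case its docstring calls out:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(5, dtype=float))

    # List-like indexer: the value must have the same length as the indexer,
    # otherwise a ValueError is raised.
    try:
        s[[0, 1, 2]] = [10.0, 20.0]
    except ValueError as err:
        print(err)  # cannot set using a list-like indexer with a different length ...

    # Boolean-mask carve-out: the mask may be longer than the value as long
    # as its number of True entries equals len(value).
    mask = np.array([True, False, True, False, False])
    s[mask] = [10.0, 20.0]
    print(s.tolist())  # [10.0, 1.0, 20.0, 3.0, 4.0]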
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index f8078d2798b32..9da985625c4ee 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -49,3 +49,4 @@ class TestMyDtype(BaseDtypeTests): from .methods import BaseMethodsTests # noqa from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa +from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py new file mode 100644 index 0000000000000..e91345b504d86 --- /dev/null +++ b/pandas/tests/extension/base/setitem.py @@ -0,0 +1,167 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from .base import BaseExtensionTests + + +class BaseSetitemTests(BaseExtensionTests): + def test_setitem_scalar_series(self, data): + arr = pd.Series(data) + arr[0] = data[1] + assert arr[0] == data[1] + + def test_setitem_sequence(self, data): + arr = pd.Series(data) + original = data.copy() + + arr[[0, 1]] = [data[1], data[0]] + assert arr[0] == original[1] + assert arr[1] == original[0] + + @pytest.mark.parametrize('as_array', [True, False]) + def test_setitem_sequence_mismatched_length_raises(self, data, as_array): + ser = pd.Series(data) + value = [data[0]] + if as_array: + value = type(data)(value) + + xpr = 'cannot set using a {} indexer with a different length' + with tm.assert_raises_regex(ValueError, xpr.format('list-like')): + ser[[0, 1]] = value + + with tm.assert_raises_regex(ValueError, xpr.format('slice')): + ser[slice(3)] = value + + def test_setitem_empty_indxer(self, data): + ser = pd.Series(data) + original = ser.copy() + ser[[]] = [] + self.assert_series_equal(ser, original) + + def test_setitem_sequence_broadcasts(self, data): + arr = pd.Series(data) + + arr[[0, 1]] = data[2] + assert arr[0] == data[2] + assert arr[1] == data[2] + + @pytest.mark.parametrize('setter', ['loc', 'iloc']) + def test_setitem_scalar(self, data, setter): + arr = pd.Series(data) + setter = getattr(arr, setter) + operator.setitem(setter, 0, data[1]) + assert arr[0] == data[1] + + def test_setitem_loc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.loc[0, 'B'] = data[1] + assert df.loc[0, 'B'] == data[1] + + def test_setitem_loc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.loc[10, 'B'] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.loc[10, 'B'] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_setitem_iloc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.iloc[0, 1] = data[1] + assert df.loc[0, 'B'] == data[1] + + def test_setitem_iloc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.iloc[10, 0] = data[1] + assert df.loc[10, 'B'] == data[1] + + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.iloc[10, 1] = data[1] + assert df.loc[10, 'B'] == data[1] + + @pytest.mark.parametrize('as_callable', [True, False]) + @pytest.mark.parametrize('setter', ['loc', None]) + def test_setitem_mask_aligned(self, data, as_callable, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + if as_callable: + mask2 = lambda x: mask + else: + mask2 = mask + + if setter: + # loc + target = 
getattr(ser, setter) + else: + # Series.__setitem__ + target = ser + + operator.setitem(target, mask2, data[5:7]) + + ser[mask2] = data[5:7] + assert ser[0] == data[5] + assert ser[1] == data[6] + + @pytest.mark.parametrize('setter', ['loc', None]) + def test_setitem_mask_broadcast(self, data, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + if setter: # loc + target = getattr(ser, setter) + else: # __setitem__ + target = ser + + operator.setitem(target, mask, data[10]) + assert ser[0] == data[10] + assert ser[1] == data[10] + + def test_setitem_expand_columns(self, data): + df = pd.DataFrame({"A": data}) + result = df.copy() + result['B'] = 1 + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + self.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[:, 'B'] = 1 + self.assert_frame_equal(result, expected) + + # overwrite with new type + result['B'] = data + expected = pd.DataFrame({"A": data, "B": data}) + self.assert_frame_equal(result, expected) + + def test_setitem_expand_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + result = df.copy() + result['B'] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + self.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[:, 'B'] = data + self.assert_frame_equal(result, expected) + + def test_setitem_frame_invalid_length(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + xpr = "Length of values does not match length of index" + with tm.assert_raises_regex(ValueError, xpr): + df['B'] = data[:5] + + @pytest.mark.xfail(reason="GH-20441: setitem on extension types.") + def test_setitem_tuple_index(self, data): + s = pd.Series(data[:2], index=[(0, 0), (0, 1)]) + expected = pd.Series(data.take([1, 1]), index=s.index) + s[(0, 1)] = data[1] + self.assert_series_equal(s, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 7528299578326..6abf1f7f9a65a 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -85,6 +85,10 @@ def test_getitem_scalar(self): pass +class TestSetitem(base.BaseSetitemTests): + pass + + class TestMissing(base.BaseMissingTests): @pytest.mark.skip(reason="Not implemented") diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index b66a14c77a059..f93d11f579f11 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -31,6 +31,9 @@ def __init__(self, values): values = np.asarray(values, dtype=object) self.values = values + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self._data = self.data = self.values @classmethod def _constructor_from_sequence(cls, scalars): @@ -62,7 +65,7 @@ def __len__(self): return len(self.values) def __repr__(self): - return repr(self.values) + return 'DecimalArray({!r})'.format(self.values) @property def nbytes(self): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 87668cc1196b6..dcf08440738e7 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -135,6 +135,10 @@ def test_astype_str(self): """ +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. 
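To make that skip rationale concrete, a small hedged illustration: `JSONArray` here is the test-only extension array defined in pandas/tests/extension/json/array.py, and the exact exception raised (if any) depends on the indexing path taken.

    import pandas as pd
    from pandas.tests.extension.json.array import JSONArray

    s = pd.Series(JSONArray([{'a': 1}, {'b': 2}]))

    # A single JSONArray element is itself a collection, so the value below is
    # ambiguous: one scalar for position 0, or a sequence of values? pandas'
    # internals treat dict-likes as list-likes, which is why the base setitem
    # tests are not run for this array.
    try:
        s[0] = {'c': 3}
    except Exception as err:
        print(type(err).__name__, err)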
+ + class TestGroupby(base.BaseGroupbyTests): @unhashable From 8756f55234e4fa00a116cb105d36861f1bc6100b Mon Sep 17 00:00:00 2001 From: David Hoese Date: Mon, 16 Apr 2018 05:34:35 -0500 Subject: [PATCH 07/33] DEP: Add 'python_requires' to setup.py to drop 3.4 support (#20698) --- ci/environment-dev.yaml | 2 +- ci/requirements_dev.txt | 4 ++-- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.23.0.txt | 2 ++ setup.py | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index 1337fc54e9aac..f9f9208519d61 100644 --- a/ci/environment-dev.yaml +++ b/ci/environment-dev.yaml @@ -11,5 +11,5 @@ dependencies: - python-dateutil>=2.5.0 - python=3 - pytz - - setuptools>=3.3 + - setuptools>=24.2.0 - sphinx diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index fcbe0da5de305..3430e778a4573 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -7,5 +7,5 @@ moto pytest>=3.1 python-dateutil>=2.5.0 pytz -setuptools>=3.3 -sphinx \ No newline at end of file +setuptools>=24.2.0 +sphinx diff --git a/doc/source/install.rst b/doc/source/install.rst index fdb22a8dc3380..c46f78ed6b6f7 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -223,7 +223,7 @@ installed), make sure you have `pytest Dependencies ------------ -* `setuptools `__: 3.3.0 or higher +* `setuptools `__: 24.2.0 or higher * `NumPy `__: 1.9.0 or higher * `python-dateutil `__: 2.5.0 or higher * `pytz `__ diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1e094db7d7b64..1992c27fd11ed 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -435,6 +435,8 @@ If installed, we now require: +-----------------+-----------------+----------+---------------+ | beautifulsoup4 | 4.2.1 | | :issue:`20082`| +-----------------+-----------------+----------+---------------+ +| setuptools | 24.2.0 | | :issue:`20698`| ++-----------------+-----------------+----------+---------------+ .. _whatsnew_0230.api_breaking.dict_insertion_order: diff --git a/setup.py b/setup.py index 7fb5358d0950b..973b4c0abcde2 100755 --- a/setup.py +++ b/setup.py @@ -748,4 +748,5 @@ def pxd(name): long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, platforms='any', + python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*', **setuptools_kwargs) From da33359b4d19c9bc25710854472cb67918611a2d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 16 Apr 2018 06:36:19 -0400 Subject: [PATCH 08/33] DOC: Correct documentation to GroupBy.rank (#20708) Closes gh-20694. --- pandas/core/groupby/groupby.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7c89cab6b1428..8c20d62117e25 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1841,24 +1841,27 @@ def cumcount(self, ascending=True): @Appender(_doc_template) def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0): - """Provides the rank of values within each group + """ + Provides the rank of values within each group. 
Parameters ---------- - method : {'average', 'min', 'max', 'first', 'dense'}, efault 'average' + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups - method : {'keep', 'top', 'bottom'}, default 'keep' + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending - ascending : boolean, default True - False for ranks by high (1) to low (N) pct : boolean, default False Compute percentage rank of data within each group + axis : int, default 0 + The axis of the object over which to compute the rank. Returns ----- From 4a344972722cc3c27250cbc8e382472b13e66bde Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 16 Apr 2018 10:54:03 -0400 Subject: [PATCH 09/33] API: rolling.apply will pass Series to function (#20584) closes #5071 --- doc/source/whatsnew/v0.23.0.txt | 32 ++ pandas/_libs/window.pyx | 46 ++- pandas/core/generic.py | 2 + pandas/core/window.py | 54 ++- pandas/tests/test_window.py | 651 ++++++++++++++++++-------------- 5 files changed, 479 insertions(+), 306 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1992c27fd11ed..641214550a3b7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -65,6 +65,35 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp pd.get_dummies(df, columns=['c'], dtype=bool).dtypes +.. _whatsnew_0230.enhancements.window_raw: + +Rolling/Expanding.apply() accepts a ``raw`` keyword to pass a ``Series`` to the function +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, +:func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have gained a ``raw=None`` parameter. +This is similar to :func:`DataFame.apply`. This parameter, if ``True`` allows one to send a ``np.ndarray`` to the applied function. If ``False`` a ``Series`` will be passed. The +default is ``None``, which preserves backward compatibility, so this will default to ``True``, sending an ``np.ndarray``. +In a future version the default will be changed to ``False``, sending a ``Series``. (:issue:`5071`, :issue:`20584`) + +.. ipython:: python + + s = pd.Series(np.arange(5), np.arange(5) + 1) + s + +Pass a ``Series``: + +.. ipython:: python + + s.rolling(2, min_periods=1).apply(lambda x: x.iloc[-1], raw=False) + +Mimic the original behavior of passing a ndarray: + +.. ipython:: python + + s.rolling(2, min_periods=1).apply(lambda x: x[-1], raw=True) + + .. _whatsnew_0230.enhancements.merge_on_columns_and_levels: Merging on a combination of columns and index levels @@ -817,6 +846,7 @@ Other API Changes - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`). 
- :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`) +- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than an ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`) .. _whatsnew_0230.deprecations: @@ -845,6 +875,8 @@ Deprecations - ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`) - ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`) - The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`). +- :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, + :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index aa13f03d8e9e4..e524f823605a4 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1432,30 +1432,35 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, return output -def roll_generic(ndarray[float64_t, cast=True] input, +def roll_generic(object obj, int64_t win, int64_t minp, object index, object closed, - int offset, object func, + int offset, object func, bint raw, object args, object kwargs): cdef: ndarray[double_t] output, counts, bufarr + ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf int64_t nobs = 0, i, j, s, e, N bint is_variable ndarray[int64_t] start, end - if not input.flags.c_contiguous: - input = input.copy('C') - - n = len(input) + n = len(obj) if n == 0: - return input + return obj + + arr = np.asarray(obj) + + # ndarray input + if raw: + if not arr.flags.c_contiguous: + arr = arr.copy('C') - counts = roll_sum(np.concatenate([np.isfinite(input).astype(float), + counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float), np.array([0.] 
* offset)]), win, minp, index, closed)[offset:] - start, end, N, win, minp, is_variable = get_window_indexer(input, win, + start, end, N, win, minp, is_variable = get_window_indexer(arr, win, minp, index, closed, floor=0) @@ -1463,8 +1468,8 @@ def roll_generic(ndarray[float64_t, cast=True] input, output = np.empty(N, dtype=float) if is_variable: + # variable window arr or series - # variable window if offset != 0: raise ValueError("unable to roll_generic with a non-zero offset") @@ -1473,7 +1478,20 @@ def roll_generic(ndarray[float64_t, cast=True] input, e = end[i] if counts[i] >= minp: - output[i] = func(input[s:e], *args, **kwargs) + if raw: + output[i] = func(arr[s:e], *args, **kwargs) + else: + output[i] = func(obj.iloc[s:e], *args, **kwargs) + else: + output[i] = NaN + + elif not raw: + # series + for i from 0 <= i < N: + if counts[i] >= minp: + sl = slice(int_max(i + offset - win + 1, 0), + int_min(i + offset + 1, N)) + output[i] = func(obj.iloc[sl], *args, **kwargs) else: output[i] = NaN @@ -1482,12 +1500,12 @@ def roll_generic(ndarray[float64_t, cast=True] input, # truncated windows at the beginning, through first full-length window for i from 0 <= i < (int_min(win, N) - offset): if counts[i] >= minp: - output[i] = func(input[0: (i + offset + 1)], *args, **kwargs) + output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs) else: output[i] = NaN # remaining full-length windows - buf = input.data + buf = arr.data bufarr = np.empty(win, dtype=float) oldbuf = bufarr.data for i from (win - offset) <= i < (N - offset): @@ -1502,7 +1520,7 @@ def roll_generic(ndarray[float64_t, cast=True] input, # truncated windows at the end for i from int_max(N - offset, 0) <= i < N: if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0): N], + output[i] = func(arr[int_max(i + offset - win + 1, 0): N], *args, **kwargs) else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ae9d160db08e9..d3ab7afc025c9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4292,6 +4292,8 @@ def pipe(self, func, *args, **kwargs): Notes ----- `agg` is an alias for `aggregate`. Use the alias. + + A passed user-defined-function will be passed a Series for evaluation. """) _shared_docs['transform'] = (""" diff --git a/pandas/core/window.py b/pandas/core/window.py index 5cd4fffb5d7dd..f8b5aa292f309 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -314,7 +314,7 @@ def _center_window(self, result, window): def aggregate(self, arg, *args, **kwargs): result, how = self._aggregate(arg, *args, **kwargs) if result is None: - return self.apply(arg, args=args, kwargs=kwargs) + return self.apply(arg, raw=False, args=args, kwargs=kwargs) return result agg = aggregate @@ -954,23 +954,53 @@ def count(self): Parameters ---------- func : function - Must produce a single value from an ndarray input - \*args and \*\*kwargs are passed to the function""") + Must produce a single value from an ndarray input if ``raw=True`` + or a Series if ``raw=False`` + raw : bool, default None + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` or ``None`` : the passed function will receive ndarray + objects instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + The `raw` parameter is required and will show a FutureWarning if + not passed. In the future `raw` will default to False. + + .. 
versionadded:: 0.23.0 + + \*args and \*\*kwargs are passed to the function""") + + def apply(self, func, raw=None, args=(), kwargs={}): + from pandas import Series - def apply(self, func, args=(), kwargs={}): # TODO: _level is unused? _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) index, indexi = self._get_index() + # TODO: default is for backward compat + # change to False in the future + if raw is None: + warnings.warn( + "Currently, 'apply' passes the values as ndarrays to the " + "applied function. In the future, this will change to passing " + "it as Series objects. You need to specify 'raw=True' to keep " + "the current behaviour, and you can pass 'raw=False' to " + "silence this warning", FutureWarning, stacklevel=3) + raw = True + def f(arg, window, min_periods, closed): minp = _use_window(min_periods, window) - return _window.roll_generic(arg, window, minp, indexi, closed, - offset, func, args, kwargs) + if not raw: + arg = Series(arg, index=self.obj.index) + return _window.roll_generic( + arg, window, minp, indexi, + closed, offset, func, raw, args, kwargs) return self._apply(f, func, args=args, kwargs=kwargs, - center=False) + center=False, raw=raw) def sum(self, *args, **kwargs): nv.validate_window_func('sum', args, kwargs) @@ -1498,8 +1528,9 @@ def count(self): @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['apply']) - def apply(self, func, args=(), kwargs={}): - return super(Rolling, self).apply(func, args=args, kwargs=kwargs) + def apply(self, func, raw=None, args=(), kwargs={}): + return super(Rolling, self).apply( + func, raw=raw, args=args, kwargs=kwargs) @Substitution(name='rolling') @Appender(_shared_docs['sum']) @@ -1756,8 +1787,9 @@ def count(self, **kwargs): @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['apply']) - def apply(self, func, args=(), kwargs={}): - return super(Expanding, self).apply(func, args=args, kwargs=kwargs) + def apply(self, func, raw=None, args=(), kwargs={}): + return super(Expanding, self).apply( + func, raw=raw, args=args, kwargs=kwargs) @Substitution(name='expanding') @Appender(_shared_docs['sum']) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index dabdb1e8e689c..605230390ff1d 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -29,6 +29,22 @@ def assert_equal(left, right): tm.assert_frame_equal(left, right) +@pytest.fixture(params=[True, False]) +def raw(request): + return request.param + + +@pytest.fixture(params=['triang', 'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann']) +def win_types(request): + return request.param + + +@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', 'slepian']) +def win_types_special(request): + return request.param + + class Base(object): _nan_locs = np.arange(20, 40) @@ -157,9 +173,16 @@ def test_agg(self): expected.columns = pd.MultiIndex.from_tuples(exp_cols) tm.assert_frame_equal(result, expected, check_like=True) + def test_agg_apply(self, raw): + # passed lambda + df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + + r = df.rolling(window=3) + a_sum = r['A'].sum() + result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) - rcustom = r['B'].apply(lambda x: np.std(x, ddof=1)) + rcustom = r['B'].apply(lambda x: np.std(x, ddof=1), raw=raw) expected = concat([a_sum, rcustom], axis=1) tm.assert_frame_equal(result, expected, check_like=True) @@ -289,43 +312,51 @@ def 
setup_method(self, method): self._create_data() @td.skip_if_no_scipy - def test_constructor(self): + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor(self, which): # GH 12669 - for o in [self.series, self.frame]: - c = o.rolling + o = getattr(self, which) + c = o.rolling - # valid - c(win_type='boxcar', window=2, min_periods=1) - c(win_type='boxcar', window=2, min_periods=1, center=True) - c(win_type='boxcar', window=2, min_periods=1, center=False) + # valid + c(win_type='boxcar', window=2, min_periods=1) + c(win_type='boxcar', window=2, min_periods=1, center=True) + c(win_type='boxcar', window=2, min_periods=1, center=False) - for wt in ['boxcar', 'triang', 'blackman', 'hamming', 'bartlett', - 'bohman', 'blackmanharris', 'nuttall', 'barthann']: - c(win_type=wt, window=2) + # not valid + for w in [2., 'foo', np.array([2])]: + with pytest.raises(ValueError): + c(win_type='boxcar', window=2, min_periods=w) + with pytest.raises(ValueError): + c(win_type='boxcar', window=2, min_periods=1, center=w) - # not valid - for w in [2., 'foo', np.array([2])]: - with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=w) - with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=1, center=w) + for wt in ['foobar', 1]: + with pytest.raises(ValueError): + c(win_type=wt, window=2) - for wt in ['foobar', 1]: - with pytest.raises(ValueError): - c(win_type=wt, window=2) + @td.skip_if_no_scipy + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor_with_win_type(self, which, win_types): + # GH 12669 + o = getattr(self, which) + c = o.rolling + c(win_type=win_types, window=2) - def test_numpy_compat(self): + @pytest.mark.parametrize( + 'method', ['sum', 'mean']) + def test_numpy_compat(self, method): # see gh-12811 w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) msg = "numpy operations are not valid with window objects" - for func in ('sum', 'mean'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(w, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(w, func), dtype=np.float64) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(w, method), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(w, method), dtype=np.float64) class TestRolling(Base): @@ -340,59 +371,65 @@ def test_doc_string(self): df.rolling(2).sum() df.rolling(2, min_periods=1).sum() - def test_constructor(self): + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor(self, which): # GH 12669 - for o in [self.series, self.frame]: - c = o.rolling + o = getattr(self, which) + c = o.rolling - # valid - c(window=2) - c(window=2, min_periods=1) - c(window=2, min_periods=1, center=True) - c(window=2, min_periods=1, center=False) + # valid + c(window=2) + c(window=2, min_periods=1) + c(window=2, min_periods=1, center=True) + c(window=2, min_periods=1, center=False) - # GH 13383 - c(0) - with pytest.raises(ValueError): - c(-1) + # GH 13383 + c(0) + with pytest.raises(ValueError): + c(-1) - # not valid - for w in [2., 'foo', np.array([2])]: - with pytest.raises(ValueError): - c(window=w) - with pytest.raises(ValueError): - c(window=2, min_periods=w) - with pytest.raises(ValueError): - c(window=2, min_periods=1, center=w) + # not valid + for w in [2., 'foo', np.array([2])]: + with pytest.raises(ValueError): + c(window=w) + with pytest.raises(ValueError): + c(window=2, min_periods=w) + with pytest.raises(ValueError): + c(window=2, min_periods=1, 
center=w) @td.skip_if_no_scipy - def test_constructor_with_win_type(self): + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor_with_win_type(self, which): # GH 13383 - for o in [self.series, self.frame]: - c = o.rolling - c(0, win_type='boxcar') - with pytest.raises(ValueError): - c(-1, win_type='boxcar') + o = getattr(self, which) + c = o.rolling + c(0, win_type='boxcar') + with pytest.raises(ValueError): + c(-1, win_type='boxcar') - def test_constructor_with_timedelta_window(self): + @pytest.mark.parametrize( + 'window', [timedelta(days=3), pd.Timedelta(days=3)]) + def test_constructor_with_timedelta_window(self, window): # GH 15440 n = 10 df = DataFrame({'value': np.arange(n)}, index=pd.date_range('2015-12-24', periods=n, freq="D")) expected_data = np.append([0., 1.], np.arange(3., 27., 3)) - for window in [timedelta(days=3), pd.Timedelta(days=3)]: - result = df.rolling(window=window).sum() - expected = DataFrame({'value': expected_data}, - index=pd.date_range('2015-12-24', periods=n, - freq="D")) - tm.assert_frame_equal(result, expected) - expected = df.rolling('3D').sum() - tm.assert_frame_equal(result, expected) + + result = df.rolling(window=window).sum() + expected = DataFrame({'value': expected_data}, + index=pd.date_range('2015-12-24', periods=n, + freq="D")) + tm.assert_frame_equal(result, expected) + expected = df.rolling('3D').sum() + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( 'window', [timedelta(days=3), pd.Timedelta(days=3), '3D']) - def test_constructor_with_timedelta_window_and_minperiods(self, window): + def test_constructor_timedelta_window_and_minperiods(self, window, raw): # GH 15305 n = 10 df = DataFrame({'value': np.arange(n)}, @@ -402,21 +439,22 @@ def test_constructor_with_timedelta_window_and_minperiods(self, window): index=pd.date_range('2017-08-08', periods=n, freq="D")) result_roll_sum = df.rolling(window=window, min_periods=2).sum() result_roll_generic = df.rolling(window=window, - min_periods=2).apply(sum) + min_periods=2).apply(sum, raw=raw) tm.assert_frame_equal(result_roll_sum, expected) tm.assert_frame_equal(result_roll_generic, expected) - def test_numpy_compat(self): + @pytest.mark.parametrize( + 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + def test_numpy_compat(self, method): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" - for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(r, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(r, func), dtype=np.float64) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(r, method), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(r, method), dtype=np.float64) def test_closed(self): df = DataFrame({'A': [0, 1, 2, 3, 4]}) @@ -483,35 +521,38 @@ def test_doc_string(self): df df.expanding(2).sum() - def test_constructor(self): + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor(self, which): # GH 12669 - for o in [self.series, self.frame]: - c = o.expanding + o = getattr(self, which) + c = o.expanding - # valid - c(min_periods=1) - c(min_periods=1, center=True) - c(min_periods=1, center=False) + # valid + c(min_periods=1) + c(min_periods=1, center=True) + c(min_periods=1, center=False) - # not valid - for w in [2., 'foo', np.array([2])]: - with pytest.raises(ValueError): - c(min_periods=w) - with 
pytest.raises(ValueError): - c(min_periods=1, center=w) + # not valid + for w in [2., 'foo', np.array([2])]: + with pytest.raises(ValueError): + c(min_periods=w) + with pytest.raises(ValueError): + c(min_periods=1, center=w) - def test_numpy_compat(self): + @pytest.mark.parametrize( + 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + def test_numpy_compat(self, method): # see gh-12811 e = rwindow.Expanding(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" - for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(e, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(e, func), dtype=np.float64) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(e, method), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(e, method), dtype=np.float64) @pytest.mark.parametrize( 'expander', @@ -558,55 +599,58 @@ def test_doc_string(self): df df.ewm(com=0.5).mean() - def test_constructor(self): - for o in [self.series, self.frame]: - c = o.ewm - - # valid - c(com=0.5) - c(span=1.5) - c(alpha=0.5) - c(halflife=0.75) - c(com=0.5, span=None) - c(alpha=0.5, com=None) - c(halflife=0.75, alpha=None) + @pytest.mark.parametrize( + 'which', ['series', 'frame']) + def test_constructor(self, which): + o = getattr(self, which) + c = o.ewm + + # valid + c(com=0.5) + c(span=1.5) + c(alpha=0.5) + c(halflife=0.75) + c(com=0.5, span=None) + c(alpha=0.5, com=None) + c(halflife=0.75, alpha=None) + + # not valid: mutually exclusive + with pytest.raises(ValueError): + c(com=0.5, alpha=0.5) + with pytest.raises(ValueError): + c(span=1.5, halflife=0.75) + with pytest.raises(ValueError): + c(alpha=0.5, span=1.5) - # not valid: mutually exclusive - with pytest.raises(ValueError): - c(com=0.5, alpha=0.5) - with pytest.raises(ValueError): - c(span=1.5, halflife=0.75) - with pytest.raises(ValueError): - c(alpha=0.5, span=1.5) + # not valid: com < 0 + with pytest.raises(ValueError): + c(com=-0.5) - # not valid: com < 0 - with pytest.raises(ValueError): - c(com=-0.5) + # not valid: span < 1 + with pytest.raises(ValueError): + c(span=0.5) - # not valid: span < 1 - with pytest.raises(ValueError): - c(span=0.5) + # not valid: halflife <= 0 + with pytest.raises(ValueError): + c(halflife=0) - # not valid: halflife <= 0 + # not valid: alpha <= 0 or alpha > 1 + for alpha in (-0.5, 1.5): with pytest.raises(ValueError): - c(halflife=0) + c(alpha=alpha) - # not valid: alpha <= 0 or alpha > 1 - for alpha in (-0.5, 1.5): - with pytest.raises(ValueError): - c(alpha=alpha) - - def test_numpy_compat(self): + @pytest.mark.parametrize( + 'method', ['std', 'mean', 'var']) + def test_numpy_compat(self, method): # see gh-12811 e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) msg = "numpy operations are not valid with window objects" - for func in ('std', 'mean', 'var'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(e, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(e, func), dtype=np.float64) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(e, method), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(e, method), dtype=np.float64) # gh-12373 : rolling functions error on float32 data @@ -943,11 +987,8 @@ def test_cmov_window_na_min_periods(self): tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy - def test_cmov_window_regular(self): + def test_cmov_window_regular(self, win_types): # GH 8238 - 
win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', - 'blackmanharris', 'nuttall', 'barthann'] - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) xps = { @@ -969,33 +1010,25 @@ def test_cmov_window_regular(self): 14.0825, 11.5675, np.nan, np.nan] } - for wt in win_types: - xp = Series(xps[wt]) - rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - tm.assert_series_equal(xp, rs) + xp = Series(xps[win_types]) + rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy - def test_cmov_window_regular_linear_range(self): + def test_cmov_window_regular_linear_range(self, win_types): # GH 8238 - win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', - 'blackmanharris', 'nuttall', 'barthann'] - vals = np.array(range(10), dtype=np.float) xp = vals.copy() xp[:2] = np.nan xp[-2:] = np.nan xp = Series(xp) - for wt in win_types: - rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - tm.assert_series_equal(xp, rs) + rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy - def test_cmov_window_regular_missing_data(self): + def test_cmov_window_regular_missing_data(self, win_types): # GH 8238 - win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', - 'blackmanharris', 'nuttall', 'barthann'] - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48]) xps = { @@ -1017,17 +1050,18 @@ def test_cmov_window_regular_missing_data(self): 9.16438, 13.05052, 14.02175, 16.1098, 13.65509] } - for wt in win_types: - xp = Series(xps[wt]) - rs = Series(vals).rolling(5, win_type=wt, min_periods=3).mean() - tm.assert_series_equal(xp, rs) + xp = Series(xps[win_types]) + rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean() + tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy - def test_cmov_window_special(self): + def test_cmov_window_special(self, win_types_special): # GH 8238 - win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] - kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., - 'width': 2.}, {'width': 0.5}] + kwds = { + 'kaiser': {'beta': 1.}, + 'gaussian': {'std': 1.}, + 'general_gaussian': {'power': 2., 'width': 2.}, + 'slepian': {'width': 0.5}} vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) @@ -1043,17 +1077,20 @@ def test_cmov_window_special(self): 12.90702, 12.83757, np.nan, np.nan] } - for wt, k in zip(win_types, kwds): - xp = Series(xps[wt]) - rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k) - tm.assert_series_equal(xp, rs) + xp = Series(xps[win_types_special]) + rs = Series(vals).rolling( + 5, win_type=win_types_special, center=True).mean( + **kwds[win_types_special]) + tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy - def test_cmov_window_special_linear_range(self): + def test_cmov_window_special_linear_range(self, win_types_special): # GH 8238 - win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] - kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., - 'width': 2.}, {'width': 0.5}] + kwds = { + 'kaiser': {'beta': 1.}, + 'gaussian': {'std': 1.}, + 'general_gaussian': {'power': 2., 'width': 2.}, + 'slepian': {'width': 0.5}} vals = np.array(range(10), dtype=np.float) xp = vals.copy() @@ -1061,9 +1098,10 @@ def test_cmov_window_special_linear_range(self): xp[-2:] = np.nan xp = Series(xp) - for wt, k in zip(win_types, kwds): - rs = Series(vals).rolling(5, 
win_type=wt, center=True).mean(**k) - tm.assert_series_equal(xp, rs) + rs = Series(vals).rolling( + 5, win_type=win_types_special, center=True).mean( + **kwds[win_types_special]) + tm.assert_series_equal(xp, rs) def test_rolling_median(self): self._check_moment_func(np.median, name='median') @@ -1150,43 +1188,76 @@ def test_rolling_quantile_param(self): with pytest.raises(TypeError): ser.rolling(3).quantile('foo') - def test_rolling_apply(self): + def test_rolling_apply(self, raw): # suppress warnings about empty slices, as we are deliberately testing # with a 0-length Series + with warnings.catch_warnings(): warnings.filterwarnings("ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning) - ser = Series([]) - tm.assert_series_equal(ser, - ser.rolling(10).apply(lambda x: x.mean())) - def f(x): return x[np.isfinite(x)].mean() - self._check_moment_func(np.mean, name='apply', func=f) + self._check_moment_func(np.mean, name='apply', func=f, raw=raw) - # GH 8080 + expected = Series([]) + result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x)) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) expected = Series([1., 2., 2.]) tm.assert_series_equal(result, expected) - result = s.rolling(2, min_periods=0).apply(len) + result = s.rolling(2, min_periods=0).apply(len, raw=raw) tm.assert_series_equal(result, expected) - def test_rolling_apply_out_of_bounds(self): - # #1850 + @pytest.mark.parametrize('klass', [Series, DataFrame]) + @pytest.mark.parametrize( + 'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()]) + def test_apply_future_warning(self, klass, method): + + # gh-5071 + s = klass(np.arange(3)) + + with tm.assert_produces_warning(FutureWarning): + method(s).apply(lambda x: len(x)) + + def test_rolling_apply_out_of_bounds(self, raw): + # gh-1850 vals = pd.Series([1, 2, 3, 4]) - result = vals.rolling(10).apply(np.sum) + result = vals.rolling(10).apply(np.sum, raw=raw) assert result.isna().all() - result = vals.rolling(10, min_periods=1).apply(np.sum) + result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) expected = pd.Series([1, 3, 6, 10], dtype=float) tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize('window', [2, '2s']) + def test_rolling_apply_with_pandas_objects(self, window): + # 5071 + df = pd.DataFrame({'A': np.random.randn(5), + 'B': np.random.randint(0, 10, size=5)}, + index=pd.date_range('20130101', periods=5, freq='s')) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + def test_rolling_std(self): self._check_moment_func(lambda x: np.std(x, ddof=1), name='std') @@ -1256,10 +1327,10 @@ def get_result(obj, window, min_periods=None, center=False): frame_result = get_result(self.frame, window=50) assert isinstance(frame_result, DataFrame) - tm.assert_series_equal(frame_result.iloc[-1, :], - self.frame.iloc[-50:, :].apply(static_comp, - axis=0), - check_names=False) + tm.assert_series_equal( + frame_result.iloc[-1, :], + self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), + check_names=False) # check time_rule 
works if has_time_rule: @@ -1287,7 +1358,7 @@ def get_result(obj, window, min_periods=None, center=False): static_comp(trunc_series)) tm.assert_series_equal(frame_result.xs(last_date), - trunc_frame.apply(static_comp), + trunc_frame.apply(static_comp, raw=raw), check_names=False) # excluding NaNs correctly @@ -1402,26 +1473,20 @@ def test_ewma(self): result = vals.ewm(span=100, adjust=False).mean().sum() assert np.abs(result - 1) < 1e-2 + @pytest.mark.parametrize('adjust', [True, False]) + @pytest.mark.parametrize('ignore_na', [True, False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + s = Series([1.0, 2.0, 4.0, 8.0]) - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - for f in [lambda s: s.ewm(com=2.0, adjust=True).mean(), - lambda s: s.ewm(com=2.0, adjust=True, - ignore_na=False).mean(), - lambda s: s.ewm(com=2.0, adjust=True, ignore_na=True).mean(), - ]: - result = f(s) - tm.assert_series_equal(result, expected) + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - for f in [lambda s: s.ewm(com=2.0, adjust=False).mean(), - lambda s: s.ewm(com=2.0, adjust=False, - ignore_na=False).mean(), - lambda s: s.ewm(com=2.0, adjust=False, - ignore_na=True).mean(), - ]: - result = f(s) - tm.assert_series_equal(result, expected) + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) def test_ewma_nan_handling(self): s = Series([1.] + [np.nan] * 5 + [1.]) @@ -1555,14 +1620,13 @@ def test_ewm_domain_checks(self): s.ewm(alpha=1.0) pytest.raises(ValueError, s.ewm, alpha=1.1) - def test_ew_empty_series(self): + @pytest.mark.parametrize('method', ['mean', 'vol', 'var']) + def test_ew_empty_series(self, method): vals = pd.Series([], dtype=np.float64) ewm = vals.ewm(3) - funcs = ['mean', 'vol', 'var'] - for f in funcs: - result = getattr(ewm, f)() - tm.assert_almost_equal(result, vals) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) def _check_ew(self, name=None, preserve_nan=False): series_result = getattr(self.series.ewm(com=10), name)() @@ -2160,7 +2224,7 @@ def test_expanding_consistency(self, min_periods): if name == 'count': expanding_f_result = expanding_f() expanding_apply_f_result = x.expanding( - min_periods=0).apply(func=f) + min_periods=0).apply(func=f, raw=True) else: if name in ['cov', 'corr']: expanding_f_result = expanding_f( @@ -2168,7 +2232,7 @@ def test_expanding_consistency(self, min_periods): else: expanding_f_result = expanding_f() expanding_apply_f_result = x.expanding( - min_periods=min_periods).apply(func=f) + min_periods=min_periods).apply(func=f, raw=True) # GH 9422 if name in ['sum', 'prod']: @@ -2259,7 +2323,7 @@ def test_rolling_consistency(self, window, min_periods, center): rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( window=window, min_periods=0, - center=center).apply(func=f) + center=center).apply(func=f, raw=True) else: if name in ['cov', 'corr']: rolling_f_result = rolling_f( @@ -2268,7 +2332,7 @@ def test_rolling_consistency(self, window, min_periods, center): rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( window=window, min_periods=min_periods, - center=center).apply(func=f) + center=center).apply(func=f, raw=True) # GH 9422 if name in ['sum', 'prod']: @@ -2348,29 +2412,25 @@ def test_corr_sanity(self): except AssertionError: print(res) - def test_flex_binary_frame(self): 
- def _check(method): - series = self.frame[1] + @pytest.mark.parametrize('method', ['corr', 'cov']) + def test_flex_binary_frame(self, method): + series = self.frame[1] - res = getattr(series.rolling(window=10), method)(self.frame) - res2 = getattr(self.frame.rolling(window=10), method)(series) - exp = self.frame.apply(lambda x: getattr( - series.rolling(window=10), method)(x)) + res = getattr(series.rolling(window=10), method)(self.frame) + res2 = getattr(self.frame.rolling(window=10), method)(series) + exp = self.frame.apply(lambda x: getattr( + series.rolling(window=10), method)(x)) - tm.assert_frame_equal(res, exp) - tm.assert_frame_equal(res2, exp) + tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(res2, exp) - frame2 = self.frame.copy() - frame2.values[:] = np.random.randn(*frame2.shape) + frame2 = self.frame.copy() + frame2.values[:] = np.random.randn(*frame2.shape) - res3 = getattr(self.frame.rolling(window=10), method)(frame2) - exp = DataFrame(dict((k, getattr(self.frame[k].rolling( - window=10), method)(frame2[k])) for k in self.frame)) - tm.assert_frame_equal(res3, exp) - - methods = ['corr', 'cov'] - for meth in methods: - _check(meth) + res3 = getattr(self.frame.rolling(window=10), method)(frame2) + exp = DataFrame(dict((k, getattr(self.frame[k].rolling( + window=10), method)(frame2[k])) for k in self.frame)) + tm.assert_frame_equal(res3, exp) def test_ewmcov(self): self._check_binary_ew('cov') @@ -2417,19 +2477,24 @@ def func(A, B, com, **kwargs): pytest.raises(Exception, func, A, randn(50), 20, min_periods=5) - def test_expanding_apply_args_kwargs(self): + def test_expanding_apply_args_kwargs(self, raw): + def mean_w_arg(x, const): return np.mean(x) + const df = DataFrame(np.random.rand(20, 3)) - expected = df.expanding().apply(np.mean) + 20. + expected = df.expanding().apply(np.mean, raw=raw) + 20. 
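(Annotation, not part of the patch: an illustrative, standalone sketch of the identity this test relies on. The args/kwargs pair is forwarded to the user function, so passing the constant either way must match adding it after the fact; all names below are local to the sketch.)

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(20, 3))

    def mean_w_arg(x, const):
        # x arrives as an ndarray because raw=True
        return np.mean(x) + const

    # positional and keyword forwarding are both equivalent to "+ 20" afterwards
    lhs = df.expanding().apply(mean_w_arg, raw=True, args=(20,))
    rhs = df.expanding().apply(mean_w_arg, raw=True, kwargs={'const': 20})
    assert np.allclose(lhs, df.expanding().apply(np.mean, raw=True) + 20.0)
    assert np.allclose(rhs, lhs)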
- tm.assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )), - expected) - tm.assert_frame_equal(df.expanding().apply(mean_w_arg, - kwargs={'const': 20}), - expected) + result = df.expanding().apply(mean_w_arg, + raw=raw, + args=(20, )) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, + raw=raw, + kwargs={'const': 20}) + tm.assert_frame_equal(result, expected) def test_expanding_corr(self): A = self.series.dropna() @@ -2539,42 +2604,47 @@ def test_rolling_corr_diff_length(self): result = s1.rolling(window=3, min_periods=2).corr(s2a) tm.assert_series_equal(result, expected) - def test_rolling_functions_window_non_shrinkage(self): + @pytest.mark.parametrize( + 'f', + [ + lambda x: (x.rolling(window=10, min_periods=5) + .cov(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5) + .corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling( + window=10, min_periods=5).quantile(quantile=0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply( + sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply( + sum, raw=True), + lambda x: x.rolling(win_type='boxcar', + window=10, min_periods=5).mean()]) + def test_rolling_functions_window_non_shrinkage(self, f): # GH 7764 s = Series(range(4)) s_expected = Series(np.nan, index=s.index) df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - functions = [lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=False)), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling( - window=10, min_periods=5).quantile(quantile=0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum), - lambda x: x.rolling(win_type='boxcar', - window=10, min_periods=5).mean()] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) - df_result = f(df) - tm.assert_frame_equal(df_result, df_expected) - except (ImportError): + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) + except (ImportError): - # scipy needed for rolling_window - continue + # scipy needed for rolling_window + pytest.skip("scipy not available") def test_rolling_functions_window_non_shrinkage_binary(self): @@ -2620,7 +2690,10 @@ def test_moment_functions_zero_length(self): lambda x: x.expanding(min_periods=5).kurt(), lambda x: x.expanding(min_periods=5).quantile(0.5), lambda x: 
x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum), + lambda x: x.expanding(min_periods=5).apply( + sum, raw=False), + lambda x: x.expanding(min_periods=5).apply( + sum, raw=True), lambda x: x.rolling(window=10).count(), lambda x: x.rolling(window=10, min_periods=5).cov( x, pairwise=False), @@ -2637,7 +2710,10 @@ def test_moment_functions_zero_length(self): lambda x: x.rolling( window=10, min_periods=5).quantile(0.5), lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum), + lambda x: x.rolling(window=10, min_periods=5).apply( + sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply( + sum, raw=True), lambda x: x.rolling(win_type='boxcar', window=10, min_periods=5).mean(), ] @@ -2805,20 +2881,25 @@ def expanding_func(x, min_periods=1, center=False, axis=0): return getattr(exp, func)() self._check_expanding(expanding_func, static_comp, preserve_nan=False) - def test_expanding_apply(self): + def test_expanding_apply(self, raw): def expanding_mean(x, min_periods=1): + exp = x.expanding(min_periods=min_periods) - return exp.apply(lambda x: x.mean()) + result = exp.apply(lambda x: x.mean(), raw=raw) + return result - self._check_expanding(expanding_mean, np.mean) + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) + tm.assert_series_equal(ser, ser.expanding().apply( + lambda x: x.mean(), raw=raw)) # GH 8080 s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x)) + result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) expected = Series([1., 2., 3.]) tm.assert_series_equal(result, expected) @@ -3057,13 +3138,14 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) - def test_rolling_apply(self): + def test_rolling_apply(self, raw): g = self.frame.groupby('A') r = g.rolling(window=4) # reduction - result = r.apply(lambda x: x.sum()) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum())) + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply( + lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) def test_expanding(self): @@ -3104,13 +3186,14 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) - def test_expanding_apply(self): + def test_expanding_apply(self, raw): g = self.frame.groupby('A') r = g.expanding() # reduction - result = r.apply(lambda x: x.sum()) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) @@ -3624,22 +3707,22 @@ def test_ragged_max(self): expected['B'] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - def test_ragged_apply(self): + def test_ragged_apply(self, raw): df = self.ragged f = lambda x: 1 - result = df.rolling(window='1s', min_periods=1).apply(f) + result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw) expected = df.copy() expected['B'] = 1. tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).apply(f) + result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw) expected = df.copy() expected['B'] = 1. 
tm.assert_frame_equal(result, expected)

-        result = df.rolling(window='5s', min_periods=1).apply(f)
+        result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw)
         expected = df.copy()
         expected['B'] = 1.
         tm.assert_frame_equal(result, expected)
@@ -3662,8 +3745,14 @@ def test_all(self):
         expected = er.quantile(0.5)
         tm.assert_frame_equal(result, expected)

-        result = r.apply(lambda x: 1)
-        expected = er.apply(lambda x: 1)
+    def test_all_apply(self, raw):
+
+        df = self.regular * 2
+        er = df.rolling(window=1)
+        r = df.rolling(window='1s')
+
+        result = r.apply(lambda x: 1, raw=raw)
+        expected = er.apply(lambda x: 1, raw=raw)
         tm.assert_frame_equal(result, expected)

     def test_all2(self):

From 6245e8c983a685a46e3b64d64aaa59afc4655ed6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 17 Apr 2018 09:53:32 +0200
Subject: [PATCH 10/33] TST: add tests for take() on empty arrays (#20582)

---
 pandas/core/arrays/base.py                 | 18 +++++++
 pandas/tests/extension/base/getitem.py     | 47 +++++++++++++++++++
 .../extension/category/test_categorical.py | 13 +++++
 pandas/tests/extension/decimal/array.py    |  4 ++
 pandas/tests/extension/json/array.py       |  8 +++-
 5 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c281bd80cb274..d49a0d799526a 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -458,11 +458,23 @@ def take(self, indexer, allow_fill=True, fill_value=None):
             Fill value to replace -1 values with. If applicable, this should
             use the sentinel missing value for this type.

+        Returns
+        -------
+        ExtensionArray
+
+        Raises
+        ------
+        IndexError
+            When the indexer is out of bounds for the array.
+
         Notes
         -----
         This should follow pandas' semantics where -1 indicates missing
         values. Positions where indexer is ``-1`` should be filled with
         the missing value for this type.
+        This gives rise to the special case of a take on an empty
+        ExtensionArray that does not raise an IndexError straight away
+        when the `indexer` is all ``-1``.

         This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
         indexer is a sequence of values.
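(Annotation, not part of the patch: a pseudocode summary of the take semantics the docstring above describes, assuming ``arr`` is any length-3 ExtensionArray and ``NA`` its missing-value sentinel.)

    # arr.take([0, -1])     -> [arr[0], NA]   # -1 is filled with the NA sentinel
    # arr[:0].take([-1])    -> [NA]           # all-missing take on an empty array works
    # arr[:0].take([0, 1])  -> IndexError     # non-empty take from an empty array raises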
@@ -477,6 +489,12 @@ def take(self, indexer, allow_fill=True, fill_value=None): def take(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) mask = indexer == -1 + + # take on empty array not handled as desired by numpy + # in case of -1 (all missing take) + if not len(self) and mask.all(): + return type(self)([np.nan] * len(indexer)) + result = self.data.take(indexer) result[mask] = np.nan # NA for this type return type(self)(result) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 566ba1721d13c..4e2a65eba06dc 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -1,6 +1,8 @@ +import pytest import numpy as np import pandas as pd +import pandas.util.testing as tm from .base import BaseExtensionTests @@ -120,3 +122,48 @@ def test_take_sequence(self, data): assert result.iloc[0] == data[0] assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] + + def test_take(self, data, na_value, na_cmp): + result = data.take([0, -1]) + assert result.dtype == data.dtype + assert result[0] == data[0] + na_cmp(result[1], na_value) + + with tm.assert_raises_regex(IndexError, "out of bounds"): + data.take([len(data) + 1]) + + def test_take_empty(self, data, na_value, na_cmp): + empty = data[:0] + result = empty.take([-1]) + na_cmp(result[0], na_value) + + with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): + empty.take([0, 1]) + + @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") + def test_take_series(self, data): + s = pd.Series(data) + result = s.take([0, -1]) + expected = pd.Series( + data._constructor_from_sequence([data[0], data[len(data) - 1]]), + index=[0, len(data) - 1]) + self.assert_series_equal(result, expected) + + def test_reindex(self, data, na_value): + s = pd.Series(data) + result = s.reindex([0, 1, 3]) + expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) + self.assert_series_equal(result, expected) + + n = len(data) + result = s.reindex([-1, 0, n]) + expected = pd.Series( + data._constructor_from_sequence([na_value, data[0], na_value]), + index=[-1, 0, n]) + self.assert_series_equal(result, expected) + + result = s.reindex([n, n + 1]) + expected = pd.Series( + data._constructor_from_sequence([na_value, na_value]), + index=[n, n + 1]) + self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 6abf1f7f9a65a..27c156c15203f 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -84,6 +84,19 @@ def test_getitem_scalar(self): # to break things by changing. 
pass + @pytest.mark.xfail(reason="Categorical.take buggy") + def test_take(self): + # TODO remove this once Categorical.take is fixed + pass + + @pytest.mark.xfail(reason="Categorical.take buggy") + def test_take_empty(self): + pass + + @pytest.mark.xfail(reason="test not written correctly for categorical") + def test_reindex(self): + pass + class TestSetitem(base.BaseSetitemTests): pass diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f93d11f579f11..a8e88365b5648 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -81,6 +81,10 @@ def take(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) mask = indexer == -1 + # take on empty array not handled as desired by numpy in case of -1 + if not len(self) and mask.all(): + return type(self)([self._na_value] * len(indexer)) + indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) out[mask] = self._na_value diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d9ae49d87804a..33843492cb706 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -89,8 +89,12 @@ def isna(self): return np.array([x == self._na_value for x in self.data]) def take(self, indexer, allow_fill=True, fill_value=None): - output = [self.data[loc] if loc != -1 else self._na_value - for loc in indexer] + try: + output = [self.data[loc] if loc != -1 else self._na_value + for loc in indexer] + except IndexError: + raise IndexError("Index is out of bounds or cannot do a " + "non-empty take from an empty array.") return self._constructor_from_sequence(output) def copy(self, deep=False): From 75295e16dbd449c29609ec6e3e09087df977744b Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Tue, 17 Apr 2018 11:31:42 +0100 Subject: [PATCH 11/33] CLN: Replacing %s with .format in pandas/core/frame.py (#20461) --- pandas/core/frame.py | 84 ++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f476bff4df2cd..a71ade3da87de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -411,7 +411,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as e: exc = TypeError('DataFrame constructor called with ' - 'incompatible data and dtype: %s' % e) + 'incompatible data and dtype: {e}'.format(e=e)) raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: @@ -520,8 +520,9 @@ def _get_axes(N, K, index=index, columns=columns): try: values = values.astype(dtype) except Exception as orig: - e = ValueError("failed to cast to '%s' (Exception was: %s)" - % (dtype, orig)) + e = ValueError("failed to cast to '{dtype}' (Exception " + "was: {orig})".format(dtype=dtype, + orig=orig)) raise_with_traceback(e) index, columns = _get_axes(*values.shape) @@ -873,8 +874,9 @@ def dot(self, other): lvals = self.values rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: - raise ValueError('Dot product shape mismatch, %s vs %s' % - (lvals.shape, rvals.shape)) + raise ValueError('Dot product shape mismatch, ' + '{l} vs {r}'.format(l=lvals.shape, + r=rvals.shape)) if isinstance(other, DataFrame): return self._constructor(np.dot(lvals, rvals), index=left.index, @@ -888,7 +890,7 @@ def dot(self, other): else: return Series(result, index=left.index) else: # 
pragma: no cover - raise TypeError('unsupported type: %s' % type(other)) + raise TypeError('unsupported type: {oth}'.format(oth=type(other))) def __matmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5 """ @@ -1098,7 +1100,7 @@ def to_dict(self, orient='dict', into=dict): return into_c((t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples()) else: - raise ValueError("orient '%s' not understood" % orient) + raise ValueError("orient '{o}' not understood".format(o=orient)) def to_gbq(self, destination_table, project_id, chunksize=None, verbose=None, reauth=False, if_exists='fail', private_key=None, @@ -2140,7 +2142,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, lines.append(self.index._summary()) if len(self.columns) == 0: - lines.append('Empty %s' % type(self).__name__) + lines.append('Empty {name}'.format(name=type(self).__name__)) fmt.buffer_put_lines(buf, lines) return @@ -2166,13 +2168,15 @@ def _verbose_repr(): space = max(len(pprint_thing(k)) for k in self.columns) + 4 counts = None - tmpl = "%s%s" + tmpl = "{count}{dtype}" if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover - raise AssertionError('Columns must equal counts (%d != %d)' - % (len(cols), len(counts))) - tmpl = "%s non-null %s" + raise AssertionError( + 'Columns must equal counts ' + '({cols:d} != {counts:d})'.format( + cols=len(cols), counts=len(counts))) + tmpl = "{count} non-null {dtype}" dtypes = self.dtypes for i, col in enumerate(self.columns): @@ -2183,7 +2187,8 @@ def _verbose_repr(): if show_counts: count = counts.iloc[i] - lines.append(_put_str(col, space) + tmpl % (count, dtype)) + lines.append(_put_str(col, space) + tmpl.format(count=count, + dtype=dtype)) def _non_verbose_repr(): lines.append(self.columns._summary(name='Columns')) @@ -2192,9 +2197,12 @@ def _sizeof_fmt(num, size_qualifier): # returns size in human readable format for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: if num < 1024.0: - return "%3.1f%s %s" % (num, size_qualifier, x) + return ("{num:3.1f}{size_q}" + "{x}".format(num=num, size_q=size_qualifier, x=x)) num /= 1024.0 - return "%3.1f%s %s" % (num, size_qualifier, 'PB') + return "{num:3.1f}{size_q} {pb}".format(num=num, + size_q=size_qualifier, + pb='PB') if verbose: _verbose_repr() @@ -2207,8 +2215,9 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() counts = self.get_dtype_counts() - dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))] - lines.append('dtypes: %s' % ', '.join(dtypes)) + dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k + in sorted(compat.iteritems(counts))] + lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) if memory_usage is None: memory_usage = get_option('display.memory_usage') @@ -2226,8 +2235,9 @@ def _sizeof_fmt(num, size_qualifier): self.index._is_memory_usage_qualified()): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append("memory usage: %s\n" % - _sizeof_fmt(mem_usage, size_qualifier)) + lines.append("memory usage: {mem}\n".format( + mem=_sizeof_fmt(mem_usage, size_qualifier))) + fmt.buffer_put_lines(buf, lines) def memory_usage(self, index=True, deep=False): @@ -3013,8 +3023,8 @@ def select_dtypes(self, include=None, exclude=None): # can't both include AND exclude! 
if not include.isdisjoint(exclude): - raise ValueError('include and exclude overlap on %s' % - (include & exclude)) + raise ValueError('include and exclude overlap on {inc_ex}'.format( + inc_ex=(include & exclude))) # empty include/exclude -> defaults to True # three cases (we've already raised if both are empty) @@ -3869,7 +3879,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if verify_integrity and not index.is_unique: duplicates = index.get_duplicates() - raise ValueError('Index has duplicate keys: %s' % duplicates) + raise ValueError('Index has duplicate keys: {dup}'.format( + dup=duplicates)) for c in to_remove: del frame[c] @@ -4241,7 +4252,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, mask = count > 0 else: if how is not None: - raise ValueError('invalid how option: %s' % how) + raise ValueError('invalid how option: {h}'.format(h=how)) else: raise TypeError('must specify how or thresh') @@ -6750,8 +6761,8 @@ def _count_level(self, level, axis=0, numeric_only=False): agg_axis = frame._get_agg_axis(axis) if not isinstance(count_axis, MultiIndex): - raise TypeError("Can only count levels on hierarchical %s." % - self._get_axis_name(axis)) + raise TypeError("Can only count levels on hierarchical " + "{ax}.".format(ax=self._get_axis_name(axis))) if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might @@ -6829,9 +6840,9 @@ def f(x): elif filter_type == 'bool': data = self._get_bool_data() else: # pragma: no cover - e = NotImplementedError("Handling exception with filter_" - "type %s not implemented." % - filter_type) + e = NotImplementedError( + "Handling exception with filter_type {f} not" + "implemented.".format(f=filter_type)) raise_with_traceback(e) with np.errstate(all='ignore'): result = f(data.values) @@ -6843,8 +6854,8 @@ def f(x): elif filter_type == 'bool': data = self._get_bool_data() else: # pragma: no cover - msg = ("Generating numeric_only data with filter_type %s" - "not supported." % filter_type) + msg = ("Generating numeric_only data with filter_type {f}" + "not supported.".format(f=filter_type)) raise NotImplementedError(msg) values = data.values labels = data._get_agg_axis(axis) @@ -7119,7 +7130,8 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis)) + raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( + ax=axis)) return self._constructor(new_data) @@ -7150,7 +7162,8 @@ def to_period(self, freq=None, axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_period(freq=freq)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis)) + raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( + ax=axis)) return self._constructor(new_data) @@ -7509,8 +7522,9 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): else: if len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... 
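(Annotation, not part of the patch: a minimal, runnable illustration of the %-formatting to str.format translation applied throughout this file; the literal values are made up for the example.)

    old = '%d columns passed, passed data had %s columns' % (2, 3)
    new = ('{col:d} columns passed, passed data had '
           '{con} columns'.format(col=2, con=3))
    assert old == new == '2 columns passed, passed data had 3 columns'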
- raise AssertionError('%d columns passed, passed data had %s ' - 'columns' % (len(columns), len(content))) + raise AssertionError('{col:d} columns passed, passed data had ' + '{con} columns'.format(col=len(columns), + con=len(content))) # provide soft conversion of object dtypes def convert(arr): @@ -7585,4 +7599,4 @@ def _from_nested_dict(data): def _put_str(s, space): - return ('%s' % s)[:space].ljust(space) + return u'{s}'.format(s=s)[:space].ljust(space) From bb095a6e96217f162544b10e9e7a46f04071fb37 Mon Sep 17 00:00:00 2001 From: Zihao Zhao Date: Tue, 17 Apr 2018 03:34:13 -0700 Subject: [PATCH 12/33] change the indent for the pydoc of apply() function. (#20715) --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d6f770d92795..2ed4c99b7a998 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3130,7 +3130,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): >>> def add_custom_values(x, **kwargs): ... for month in kwargs: ... x+=kwargs[month] - ... return x + ... return x >>> series.apply(add_custom_values, june=30, july=20, august=25) London 95 From 7ed1f5371601f3300c8b4592c87159fb3eaec5cd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Apr 2018 10:52:02 +0200 Subject: [PATCH 13/33] PKG: remove pyproject.toml for now (#20718) --- pyproject.toml | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index f0d57d1d808a2..0000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[build-system] -requires = [ - "wheel", - "setuptools", - "Cython", # required for VCS build, optional for released source - "numpy==1.9.3; python_version=='3.5'", - "numpy==1.12.1; python_version=='3.6'", - "numpy==1.13.1; python_version>='3.7'", -] From b9f826f46d9ec9871a00f2d2a95a0e13f520483e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Apr 2018 12:07:34 +0200 Subject: [PATCH 14/33] DOC: use apply(raw=True) in docs to silence warning (#20741) --- doc/source/computation.rst | 2 +- doc/source/cookbook.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 4285767654e25..ff06c369e1897 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -323,7 +323,7 @@ compute the mean absolute deviation on a rolling basis: mad = lambda x: np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - s.rolling(window=60).apply(mad).plot(style='k') + s.rolling(window=60).apply(mad, raw=True).plot(style='k') .. 
_stats.rolling_window:

diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 4e61228d5c0ad..893642410af02 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -496,7 +496,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     def Red(x):
         return functools.reduce(CumRet,x,1.0)

-    S.expanding().apply(Red)
+    S.expanding().apply(Red, raw=True)


 `Replacing some values with mean of the rest of a group

From 07739aadda4a9afda31fe9ab5d7b01d19f3f1199 Mon Sep 17 00:00:00 2001
From: Graham Inggs
Date: Thu, 19 Apr 2018 12:12:48 +0200
Subject: [PATCH 15/33] Fix more tests expecting little-endian (#20738)

As in #14832, use = (native) instead of < (little-endian)
---
 pandas/tests/dtypes/test_cast.py            | 4 ++--
 pandas/tests/frame/test_convert_to.py       | 2 +-
 pandas/tests/indexes/period/test_formats.py | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 590f28b275aec..20cd8b43478d2 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -191,9 +191,9 @@ def testinfer_dtype_from_scalar_errors(self):
         (pd.Categorical(list('aabc')), 'category', True),
         (pd.Categorical([1, 2, 3]), 'category', True),
         (Timestamp('20160101'), np.object_, False),
-        (np.datetime64('2016-01-01'), np.dtype('<M8[ns]'), True),
+        (np.datetime64('2016-01-01'), np.dtype('=M8[ns]'), True),

[remainder of this patch garbled in the source: the angle-bracketed dtype
strings were stripped as markup, taking the following hunks and the next
patch's "From" header lines with them. Per the subject line, the lost hunks
replace '<'-prefixed (little-endian) dtype strings with '='-prefixed
(native) ones in test_cast.py, test_convert_to.py and test_formats.py.]

Date: Thu, 19 Apr 2018 03:15:32 -0700
Subject: [PATCH 16/33] DOC: add coverage href to README.md (#20736)

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 86cf95508a5d9..36323410854b0 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,11 @@
[badge-table hunk garbled in the source: HTML markup stripped. Per the
subject line, the Coverage badge image in the README badge table is wrapped
in a link pointing at the coverage report.]

From 78fee04e95e3c53c83c938285580c39e7761ddc8 Mon Sep 17 00:00:00 2001
From: jschendel
Date: Thu, 19 Apr 2018 06:43:20 -0600
Subject: [PATCH 17/33] DEPR: Deprecate DatetimeIndex.offset in favor of
 DatetimeIndex.freq (#20730)

---
 doc/source/whatsnew/v0.23.0.txt                 |   1 +
 pandas/core/indexes/datetimes.py                | 122 ++++++++++--------
 .../indexes/datetimes/test_construction.py      |   6 +-
 .../indexes/datetimes/test_date_range.py        |  32 ++---
 pandas/tests/indexes/datetimes/test_indexing.py |   8 +-
 pandas/tests/indexes/datetimes/test_ops.py      |  28 ++--
 pandas/tests/indexes/datetimes/test_setops.py   |   2 +-
 7 files changed, 114 insertions(+), 85 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 641214550a3b7..2975b2a53c3a8 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -877,6 +877,7 @@ Deprecations
 - The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`).
 - :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
+- ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`)

 ..
_whatsnew_0230.prior_deprecations: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 88ea3511d4ee3..e0e7ba3e8b518 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -302,7 +302,7 @@ def _add_comparison_methods(cls): _engine_type = libindex.DatetimeEngine tz = None - offset = None + _freq = None _comparables = ['name', 'freqstr', 'tz'] _attributes = ['name', 'freq', 'tz'] @@ -415,7 +415,7 @@ def __new__(cls, data=None, subarr = data.values if freq is None: - freq = data.offset + freq = data.freq verify_integrity = False else: if data.dtype != _NS_DTYPE: @@ -467,12 +467,12 @@ def __new__(cls, data=None, if freq_infer: inferred = subarr.inferred_freq if inferred: - subarr.offset = to_offset(inferred) + subarr.freq = to_offset(inferred) return subarr._deepcopy_if_needed(ref_to_data, copy) @classmethod - def _generate(cls, start, end, periods, name, offset, + def _generate(cls, start, end, periods, name, freq, tz=None, normalize=False, ambiguous='raise', closed=None): if com._count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and ' @@ -535,7 +535,7 @@ def _generate(cls, start, end, periods, name, offset, else: _normalized = _normalized and end.time() == _midnight - if hasattr(offset, 'delta') and offset != offsets.Day(): + if hasattr(freq, 'delta') and freq != offsets.Day(): if inferred_tz is None and tz is not None: # naive dates if start is not None and start.tz is None: @@ -551,11 +551,11 @@ def _generate(cls, start, end, periods, name, offset, if end.tz is None and start.tz is not None: end = end.tz_localize(start.tz, ambiguous=False) - if _use_cached_range(offset, _normalized, start, end): + if _use_cached_range(freq, _normalized, start, end): index = cls._cached_range(start, end, periods=periods, - offset=offset, name=name) + freq=freq, name=name) else: - index = _generate_regular_range(start, end, periods, offset) + index = _generate_regular_range(start, end, periods, freq) else: @@ -574,11 +574,11 @@ def _generate(cls, start, end, periods, name, offset, if end.tz is None and start.tz is not None: start = start.replace(tzinfo=None) - if _use_cached_range(offset, _normalized, start, end): + if _use_cached_range(freq, _normalized, start, end): index = cls._cached_range(start, end, periods=periods, - offset=offset, name=name) + freq=freq, name=name) else: - index = _generate_regular_range(start, end, periods, offset) + index = _generate_regular_range(start, end, periods, freq) if tz is not None and getattr(index, 'tz', None) is None: index = conversion.tz_localize_to_utc(_ensure_int64(index), tz, @@ -596,12 +596,12 @@ def _generate(cls, start, end, periods, name, offset, index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index, name=name, freq=offset, tz=tz) + index = cls._simple_new(index, name=name, freq=freq, tz=tz) return index @property def _box_func(self): - return lambda x: Timestamp(x, freq=self.offset, tz=self.tz) + return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ @@ -647,7 +647,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, result = object.__new__(cls) result._data = values result.name = name - result.offset = freq + result._freq = freq result._tz = timezones.maybe_get_tz(tz) result._tz = timezones.tz_standardize(result._tz) result._reset_identity() @@ -734,7 +734,7 @@ def 
_has_same_tz(self, other): return zzone == vzone @classmethod - def _cached_range(cls, start=None, end=None, periods=None, offset=None, + def _cached_range(cls, start=None, end=None, periods=None, freq=None, name=None): if start is None and end is None: # I somewhat believe this should never be raised externally @@ -747,30 +747,30 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, raise TypeError( 'Must either specify period or provide both start and end.') - if offset is None: + if freq is None: # This can't happen with external-facing code - raise TypeError('Must provide offset.') + raise TypeError('Must provide freq.') drc = _daterange_cache - if offset not in _daterange_cache: - xdr = generate_range(offset=offset, start=_CACHE_START, + if freq not in _daterange_cache: + xdr = generate_range(offset=freq, start=_CACHE_START, end=_CACHE_END) arr = tools.to_datetime(list(xdr), box=False) cachedRange = DatetimeIndex._simple_new(arr) - cachedRange.offset = offset + cachedRange.freq = freq cachedRange = cachedRange.tz_localize(None) cachedRange.name = None - drc[offset] = cachedRange + drc[freq] = cachedRange else: - cachedRange = drc[offset] + cachedRange = drc[freq] if start is None: if not isinstance(end, Timestamp): raise AssertionError('end must be an instance of Timestamp') - end = offset.rollback(end) + end = freq.rollback(end) endLoc = cachedRange.get_loc(end) + 1 startLoc = endLoc - periods @@ -778,23 +778,23 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, if not isinstance(start, Timestamp): raise AssertionError('start must be an instance of Timestamp') - start = offset.rollforward(start) + start = freq.rollforward(start) startLoc = cachedRange.get_loc(start) endLoc = startLoc + periods else: - if not offset.onOffset(start): - start = offset.rollforward(start) + if not freq.onOffset(start): + start = freq.rollforward(start) - if not offset.onOffset(end): - end = offset.rollback(end) + if not freq.onOffset(end): + end = freq.rollback(end) startLoc = cachedRange.get_loc(start) endLoc = cachedRange.get_loc(end) + 1 indexSlice = cachedRange[startLoc:endLoc] indexSlice.name = name - indexSlice.offset = offset + indexSlice.freq = freq return indexSlice @@ -836,7 +836,7 @@ def __setstate__(self, state): np.ndarray.__setstate__(data, nd_state) self.name = own_state[0] - self.offset = own_state[1] + self.freq = own_state[1] self._tz = timezones.tz_standardize(own_state[2]) # provide numpy < 1.7 compat @@ -1184,7 +1184,7 @@ def union(self, other): result._tz = timezones.tz_standardize(this.tz) if (result.freq is None and (this.freq is not None or other.freq is not None)): - result.offset = to_offset(result.inferred_freq) + result.freq = to_offset(result.inferred_freq) return result def to_perioddelta(self, freq): @@ -1232,7 +1232,7 @@ def union_many(self, others): this._tz = timezones.tz_standardize(tz) if this.freq is None: - this.offset = to_offset(this.inferred_freq) + this.freq = to_offset(this.inferred_freq) return this def join(self, other, how='left', level=None, return_indexers=False, @@ -1271,7 +1271,7 @@ def _maybe_utc_convert(self, other): def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None if (isinstance(other, DatetimeIndex) and - self.offset == other.offset and + self.freq == other.freq and self._can_fast_union(other)): joined = self._shallow_copy(joined) joined.name = name @@ -1284,9 +1284,9 @@ def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): return False 
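(Annotation, not part of the patch: a sketch of the user-visible effect of the rename, assuming a pandas build with this patch applied. The ``offset`` accessor added further down keeps working but emits a FutureWarning, as the new test_offset_deprecated test below verifies.)

    import warnings

    import pandas as pd

    idx = pd.DatetimeIndex(['20180101', '20180102'])
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        idx.offset                   # deprecated spelling still resolves
    assert issubclass(w[-1].category, FutureWarning)
    idx.freq                         # the supported spelling, no warning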
- offset = self.offset + freq = self.freq - if offset is None or offset != other.offset: + if freq is None or freq != other.freq: return False if not self.is_monotonic or not other.is_monotonic: @@ -1306,10 +1306,10 @@ def _can_fast_union(self, other): # Only need to "adjoin", not overlap try: - return (right_start == left_end + offset) or right_start in left + return (right_start == left_end + freq) or right_start in left except (ValueError): - # if we are comparing an offset that does not propagate timezones + # if we are comparing a freq that does not propagate timezones # this will raise return False @@ -1329,7 +1329,7 @@ def _fast_union(self, other): left_start, left_end = left[0], left[-1] right_end = right[-1] - if not self.offset._should_cache(): + if not self.freq._should_cache(): # concatenate dates if left_end < right_end: loc = right.searchsorted(left_end, side='right') @@ -1341,7 +1341,7 @@ def _fast_union(self, other): else: return type(self)(start=left_start, end=max(left_end, right_end), - freq=left.offset) + freq=left.freq) def __iter__(self): """ @@ -1393,18 +1393,18 @@ def intersection(self, other): result = Index.intersection(self, other) if isinstance(result, DatetimeIndex): if result.freq is None: - result.offset = to_offset(result.inferred_freq) + result.freq = to_offset(result.inferred_freq) return result - elif (other.offset is None or self.offset is None or - other.offset != self.offset or - not other.offset.isAnchored() or + elif (other.freq is None or self.freq is None or + other.freq != self.freq or + not other.freq.isAnchored() or (not self.is_monotonic or not other.is_monotonic)): result = Index.intersection(self, other) result = self._shallow_copy(result._values, name=result.name, tz=result.tz, freq=None) if result.freq is None: - result.offset = to_offset(result.inferred_freq) + result.freq = to_offset(result.inferred_freq) return result if len(self) == 0: @@ -1729,12 +1729,28 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): @property def freq(self): """get/set the frequency of the Index""" - return self.offset + return self._freq @freq.setter def freq(self, value): """get/set the frequency of the Index""" - self.offset = value + self._freq = value + + @property + def offset(self): + """get/set the frequency of the Index""" + msg = ('DatetimeIndex.offset has been deprecated and will be removed ' + 'in a future version; use DatetimeIndex.freq instead.') + warnings.warn(msg, FutureWarning, stacklevel=2) + return self.freq + + @offset.setter + def offset(self, value): + """get/set the frequency of the Index""" + msg = ('DatetimeIndex.offset has been deprecated and will be removed ' + 'in a future version; use DatetimeIndex.freq instead.') + warnings.warn(msg, FutureWarning, stacklevel=2) + self.freq = value year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', @@ -2525,9 +2541,9 @@ def day_name(self, locale=None): DatetimeIndex._add_datetimelike_methods() -def _generate_regular_range(start, end, periods, offset): - if isinstance(offset, Tick): - stride = offset.nanos +def _generate_regular_range(start, end, periods, freq): + if isinstance(freq, Tick): + stride = freq.nanos if periods is None: b = Timestamp(start).value # cannot just use e = Timestamp(end) + 1 because arange breaks when @@ -2558,7 +2574,7 @@ def _generate_regular_range(start, end, periods, offset): end = end.to_pydatetime() xdr = generate_range(start=start, end=end, - periods=periods, offset=offset) + periods=periods, 
offset=freq) dates = list(xdr) # utc = len(dates) > 0 and dates[0].tzinfo is not None @@ -2855,9 +2871,9 @@ def _in_range(start, end, rng_start, rng_end): return start > rng_start and end < rng_end -def _use_cached_range(offset, _normalized, start, end): - return (offset._should_cache() and - not (offset._normalize_cache and not _normalized) and +def _use_cached_range(freq, _normalized, start, end): + return (freq._should_cache() and + not (freq._normalize_cache and not _normalized) and _naive_in_cache_range(start, end)) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 1cf854ad4a926..dae69a86910af 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -598,16 +598,16 @@ def test_datetimeindex_constructor_misc(self): idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.Week(weekday=6)) assert len(idx1) == len(idx2) - assert idx1.offset == idx2.offset + assert idx1.freq == idx2.freq idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1)) assert len(idx1) == len(idx2) - assert idx1.offset == idx2.offset + assert idx1.freq == idx2.freq idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12)) assert len(idx1) == len(idx2) - assert idx1.offset == idx2.offset + assert idx1.freq == idx2.freq diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d2ec465468dfb..2dfd4ae3e6e3a 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -331,21 +331,21 @@ def test_naive_aware_conflicts(self): aware.join(naive) def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=BDay()) - DatetimeIndex._cached_range(START, periods=20, offset=BDay()) - DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) + DatetimeIndex._cached_range(START, END, freq=BDay()) + DatetimeIndex._cached_range(START, periods=20, freq=BDay()) + DatetimeIndex._cached_range(end=START, periods=20, freq=BDay()) - with tm.assert_raises_regex(TypeError, "offset"): + with tm.assert_raises_regex(TypeError, "freq"): DatetimeIndex._cached_range(START, END) with tm.assert_raises_regex(TypeError, "specify period"): - DatetimeIndex._cached_range(START, offset=BDay()) + DatetimeIndex._cached_range(START, freq=BDay()) with tm.assert_raises_regex(TypeError, "specify period"): - DatetimeIndex._cached_range(end=END, offset=BDay()) + DatetimeIndex._cached_range(end=END, freq=BDay()) with tm.assert_raises_regex(TypeError, "start or end"): - DatetimeIndex._cached_range(periods=20, offset=BDay()) + DatetimeIndex._cached_range(periods=20, freq=BDay()) def test_cached_range_bug(self): rng = date_range('2010-09-01 05:00:00', periods=50, @@ -393,7 +393,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011') rng2 = bdate_range('12/2/2011', '12/5/2011') - rng2.offset = BDay() + rng2.freq = BDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) @@ -605,27 +605,27 @@ def test_constructor(self): bdate_range('2011-1-1', '2012-1-1', 'C') def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=CDay()) + DatetimeIndex._cached_range(START, END, freq=CDay()) DatetimeIndex._cached_range(START, periods=20, - 
offset=CDay()) + freq=CDay()) DatetimeIndex._cached_range(end=START, periods=20, - offset=CDay()) + freq=CDay()) # with pytest.raises(TypeError): - with tm.assert_raises_regex(TypeError, "offset"): + with tm.assert_raises_regex(TypeError, "freq"): DatetimeIndex._cached_range(START, END) # with pytest.raises(TypeError): with tm.assert_raises_regex(TypeError, "specify period"): - DatetimeIndex._cached_range(START, offset=CDay()) + DatetimeIndex._cached_range(START, freq=CDay()) # with pytest.raises(TypeError): with tm.assert_raises_regex(TypeError, "specify period"): - DatetimeIndex._cached_range(end=END, offset=CDay()) + DatetimeIndex._cached_range(end=END, freq=CDay()) # with pytest.raises(TypeError): with tm.assert_raises_regex(TypeError, "start or end"): - DatetimeIndex._cached_range(periods=20, offset=CDay()) + DatetimeIndex._cached_range(periods=20, freq=CDay()) def test_misc(self): end = datetime(2009, 5, 13) @@ -640,7 +640,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C') rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C') - rng2.offset = CDay() + rng2.freq = CDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index af65a8618d30f..dd192db4b0eb3 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -53,10 +53,10 @@ def test_dti_business_getitem(self): exp = DatetimeIndex(rng.view(np.ndarray)[:5]) tm.assert_index_equal(smaller, exp) - assert smaller.offset == rng.offset + assert smaller.freq == rng.freq sliced = rng[::5] - assert sliced.offset == BDay() * 5 + assert sliced.freq == BDay() * 5 fancy_indexed = rng[[4, 3, 2, 1, 0]] assert len(fancy_indexed) == 5 @@ -77,10 +77,10 @@ def test_dti_custom_getitem(self): smaller = rng[:5] exp = DatetimeIndex(rng.view(np.ndarray)[:5]) tm.assert_index_equal(smaller, exp) - assert smaller.offset == rng.offset + assert smaller.freq == rng.freq sliced = rng[::5] - assert sliced.offset == CDay() * 5 + assert sliced.freq == CDay() * 5 fancy_indexed = rng[[4, 3, 2, 1, 0]] assert len(fancy_indexed) == 5 diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 8986828399a98..3c7d5d37e98f3 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -405,6 +405,18 @@ def test_equals(self): assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) + def test_offset_deprecated(self): + # GH 20716 + idx = pd.DatetimeIndex(['20180101', '20180102']) + + # getter deprecated + with tm.assert_produces_warning(FutureWarning): + idx.offset + + # setter deprecated + with tm.assert_produces_warning(FutureWarning): + idx.offset = BDay() + class TestBusinessDatetimeIndex(object): @@ -420,7 +432,7 @@ def test_comparison(self): def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) - assert unpickled.offset is not None + assert unpickled.freq is not None def test_copy(self): cp = self.rng.copy() @@ -430,15 +442,15 @@ def test_copy(self): def test_shift(self): shifted = self.rng.shift(5) assert shifted[0] == self.rng[5] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq shifted = self.rng.shift(-5) assert shifted[5] == self.rng[0] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq shifted = self.rng.shift(0) assert shifted[0] == 
self.rng[0] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq rng = date_range(START, END, freq=BMonthEnd()) shifted = rng.shift(1, freq=BDay()) @@ -485,15 +497,15 @@ def test_shift(self): shifted = self.rng.shift(5) assert shifted[0] == self.rng[5] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq shifted = self.rng.shift(-5) assert shifted[5] == self.rng[0] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq shifted = self.rng.shift(0) assert shifted[0] == self.rng[0] - assert shifted.offset == self.rng.offset + assert shifted.freq == self.rng.freq # PerformanceWarning with warnings.catch_warnings(record=True): @@ -503,7 +515,7 @@ def test_shift(self): def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) - assert unpickled.offset is not None + assert unpickled.freq is not None def test_equals(self): assert not self.rng.equals(list(self.rng)) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 84632e59e2bfb..cb9364edc0cc3 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -357,7 +357,7 @@ def test_intersection(self): expected = rng[10:25] tm.assert_index_equal(the_int, expected) assert isinstance(the_int, DatetimeIndex) - assert the_int.offset == rng.offset + assert the_int.freq == rng.freq the_int = rng1.intersection(rng2.view(DatetimeIndex)) tm.assert_index_equal(the_int, expected) From 3e691a4cba566472bef03ef9bbaec701498670e1 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 20 Apr 2018 01:35:13 +0100 Subject: [PATCH 18/33] ENH: DataFrame.append preserves columns dtype if possible (#19021) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 7 +- pandas/tests/reshape/test_concat.py | 100 +++++++++++++++++++++++++++- pandas/tests/reshape/test_pivot.py | 9 ++- 4 files changed, 109 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2975b2a53c3a8..e9a4ec9328a9b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -409,6 +409,7 @@ Other Enhancements - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`) - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) +- :meth:`DataFrame.append` can now in more cases preserve the type of the calling dataframe's columns (e.g. 
if both are ``CategoricalIndex``) (:issue:`18359`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) - ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) - ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a71ade3da87de..41d67c15c55b5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6138,8 +6138,11 @@ def append(self, other, ignore_index=False, verify_integrity=False): # index name will be reset index = Index([other.name], name=self.index.name) - combined_columns = self.columns.tolist() + self.columns.union( - other.index).difference(self.columns).tolist() + idx_diff = other.index.difference(self.columns) + try: + combined_columns = self.columns.append(idx_diff) + except TypeError: + combined_columns = self.columns.astype(object).append(idx_diff) other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), index=index, diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ffd37dc4b2f59..640d09f3587fb 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,5 +1,7 @@ from warnings import catch_warnings +from itertools import combinations, product +import datetime as dt import dateutil import numpy as np from numpy.random import randn @@ -829,12 +831,102 @@ def test_append_preserve_index_name(self): result = df1.append(df2) assert result.index.name == 'A' + indexes_can_append = [ + pd.RangeIndex(3), + pd.Index([4, 5, 6]), + pd.Index([4.5, 5.5, 6.5]), + pd.Index(list('abc')), + pd.CategoricalIndex('A B C'.split()), + pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12)]), + ] + + indexes_cannot_append_with_other = [ + pd.IntervalIndex.from_breaks([0, 1, 2, 3]), + pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), + ] + + all_indexes = indexes_can_append + indexes_cannot_append_with_other + + @pytest.mark.parametrize("index", + all_indexes, + ids=lambda x: x.__class__.__name__) + def test_append_same_columns_type(self, index): + # GH18359 + + # df wider than ser + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + ser_index = index[:2] + ser = pd.Series([7, 8], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], + index=[0, 1, 2], + columns=index) + assert_frame_equal(result, expected) + + # ser wider than df + ser_index = index + index = index[:2] + df = pd.DataFrame([[1, 2], [4, 5]], columns=index) + ser = pd.Series([7, 8, 9], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: x.__class__.__name__) + def test_append_different_columns_types(self, df_columns, series_index): + # GH18359 + # See also test 'test_append_different_columns_types_raises' below + # for errors raised when appending + + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) + 
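(Annotation, not part of the patch: a sketch of the behaviour this test matrix exercises, assuming a pandas build with this patch applied. Appending a Series whose index matches a CategoricalIndex of columns keeps that index type instead of falling back to object.)

    import pandas as pd

    cols = pd.CategoricalIndex(list('ABC'))
    df = pd.DataFrame([[1, 2, 3]], columns=cols)
    out = df.append(pd.Series([4, 5, 6], index=cols, name=1))
    assert isinstance(out.columns, pd.CategoricalIndex)  # column type preserved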
ser = pd.Series([7, 8, 9], index=series_index, name=2)
+
+        result = df.append(ser)
+        idx_diff = ser.index.difference(df_columns)
+        combined_columns = Index(df_columns.tolist()).append(idx_diff)
+        expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
+                                 [4, 5, 6, np.nan, np.nan, np.nan],
+                                 [np.nan, np.nan, np.nan, 7, 8, 9]],
+                                index=[0, 1, 2],
+                                columns=combined_columns)
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "index_can_append, index_cannot_append_with_other",
+        product(indexes_can_append, indexes_cannot_append_with_other),
+        ids=lambda x: x.__class__.__name__)
+    def test_append_different_columns_types_raises(
+            self, index_can_append, index_cannot_append_with_other):
+        # GH18359
+        # DataFrame.append will raise if an IntervalIndex/MultiIndex appends
+        # to, or is appended to, a different index type
+        #
+        # See also test 'test_append_different_columns_types' above for
+        # appending without raising.
+
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
+        ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
+                        name=2)
+        with pytest.raises(TypeError):
+            df.append(ser)
+
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
+                          columns=index_cannot_append_with_other)
+        ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
+        with pytest.raises(TypeError):
+            df.append(ser)
+
     def test_append_dtype_coerce(self):

         # GH 4993
         # appending with datetime will incorrectly convert datetime64

-        import datetime as dt
-        from pandas import NaT

         df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                             dt.datetime(2013, 1, 2, 0, 0)],
                         columns=['start_time'])
@@ -845,7 +937,9 @@ def test_append_dtype_coerce(self):
                                             dt.datetime(2013, 1, 4, 7, 10)]],
                         columns=['start_time', 'end_time'])

-        expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
+        expected = concat([Series([pd.NaT,
+                                   pd.NaT,
+                                   dt.datetime(2013, 1, 3, 6, 10),
                                    dt.datetime(2013, 1, 4, 7, 10)],
                                   name='end_time'),
                            Series([dt.datetime(2013, 1, 1, 0, 0),

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 92bedbabdf2f1..1004b40bfb4c1 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1540,12 +1540,14 @@ def test_crosstab_normalize(self):
                                    index=pd.Index([1, 2, 'All'],
                                                   name='a',
                                                   dtype='object'),
-                                   columns=pd.Index([3, 4], name='b'))
+                                   columns=pd.Index([3, 4], name='b',
+                                                    dtype='object'))
         col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
                                           index=pd.Index([1, 2], name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))

         all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
                                            [0.2, 0.6, 0.8],
                                            [0.4, 0.6, 1]],
                                           index=pd.Index([1, 2, 'All'],
                                                          name='a',
                                                          dtype='object'),
                                           columns=pd.Index([3, 4, 'All'],
-                                                           name='b'))
+                                                           name='b',
+                                                           dtype='object'))
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
                                           margins=True), row_normal_margins)
         tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',

From be057a17935f25d0a0d7d194e1b51e843a7a8d5e Mon Sep 17 00:00:00 2001
From: Haochen Wu
Date: Fri, 20 Apr 2018 03:12:36 -0700
Subject: [PATCH 19/33] DOC: Clean up badges in README (#20749)

---
 README.md | 49 +++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 36323410854b0..78e9b93ae535f 100644
--- a/README.md
+++ b/README.md
@@ -9,18 +9,33 @@
[badge-table hunk garbled in the source: the HTML table markup was stripped,
leaving only the cell text (Latest Release, Package Status, License, Build
Status, Coverage, Conda default downloads, Conda-forge downloads, PyPI
downloads, Gitter). Per the subject line, the hunk reworks the README badge
table and adds a Gitter row in place of the standalone badge removed below.]
-[![https://gitter.im/pydata/pandas](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
 
 ## What is it

From 3a2e9e6c201fee07c3417550d2d47dca74066c3d Mon Sep 17 00:00:00 2001
From: Matias Heikkilä
Date: Fri, 20 Apr 2018 13:39:03 +0300
Subject: [PATCH 20/33] BUG: fixes indexing with monotonic decreasing DTI
 (#19362) (#20677)

---
 doc/source/whatsnew/v0.23.0.txt               |  1 +
 pandas/core/indexes/datetimelike.py           |  3 ++-
 .../indexes/datetimes/test_partial_slicing.py | 21 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index e9a4ec9328a9b..bcc442189bf11 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1113,6 +1113,7 @@ Indexing
 - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` in presence of entire rows of NaNs in the middle of values (:issue:`20499`).
 - Bug in :class:`IntervalIndex` where some indexing operations were not supported for overlapping or non-monotonic ``uint64`` data (:issue:`20636`)
 - Bug in ``Series.is_unique`` where extraneous output in stderr is shown if Series contains objects with ``__ne__`` defined (:issue:`20661`)
+- Bug in partial string indexing on a ``Series/DataFrame`` with a monotonic decreasing ``DatetimeIndex`` (:issue:`19362`)
 
 MultiIndex
 ^^^^^^^^^^
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 95e1f8438c704..95186b2e79a16 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -342,7 +342,8 @@ def _format_with_header(self, header, **kwargs):
     def __contains__(self, key):
         try:
             res = self.get_loc(key)
-            return is_scalar(res) or type(res) == slice or np.any(res)
+            return (is_scalar(res) or isinstance(res, slice) or
+                    (is_list_like(res) and len(res)))
         except (KeyError, TypeError, ValueError):
             return False
 
diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py
index f263ac78cd343..4580d9fff31d5 100644
--- a/pandas/tests/indexes/datetimes/test_partial_slicing.py
+++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -91,6 +91,27 @@ def test_slice_duplicate_monotonic(self):
         expected = Timestamp('2017-01-01')
         assert result == expected
 
+    def test_monotone_DTI_indexing_bug(self):
+        # GH 19362
+        # Test accessing the first element of a monotonic decreasing
+        # DatetimeIndex via partial string indexing.
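+        # Root cause, per the ``__contains__`` change above: the old
+        # ``np.any(res)`` check misfires when ``get_loc`` returns integer
+        # positions -- for a decreasing index a partial string lookup can
+        # return ``array([0])``, and ``np.any([0])`` is False, so an
+        # existing key was reported as missing. The fix tests emptiness
+        # rather than truthiness.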
+ + df = pd.DataFrame(list(range(5))) + date_list = ['2018-01-02', '2017-02-10', '2016-03-10', + '2015-03-15', '2014-03-16'] + date_index = pd.to_datetime(date_list) + df['date'] = date_index + expected = pd.DataFrame({0: list(range(5)), 'date': date_index}) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame({'A': [1, 2, 3]}, + index=pd.date_range('20170101', + periods=3)[::-1]) + expected = pd.DataFrame({'A': 1}, + index=pd.date_range('20170103', + periods=1)) + tm.assert_frame_equal(df.loc['2017-01-03'], expected) + def test_slice_year(self): dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) From 23bc21763d1938ad0febe22bdd5e940d9383ef31 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 21 Apr 2018 11:30:40 -0500 Subject: [PATCH 21/33] DOC: Various EA docs (#20707) --- doc/source/extending.rst | 25 +++++++++++++++++++++++++ doc/source/install.rst | 2 +- doc/source/whatsnew/v0.23.0.txt | 6 +++--- pandas/core/arrays/base.py | 27 +++++++++++++++++++++------ 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/doc/source/extending.rst b/doc/source/extending.rst index 25c4ba4a4a2a3..b94a43480ed93 100644 --- a/doc/source/extending.rst +++ b/doc/source/extending.rst @@ -57,6 +57,13 @@ If you write a custom accessor, make a pull request adding it to our Extension Types --------------- +.. versionadded:: 0.23.0 + +.. warning:: + + The ``ExtensionDtype`` and ``ExtensionArray`` APIs are new and + experimental. They may change between versions without warning. + Pandas defines an interface for implementing data types and arrays that *extend* NumPy's type system. Pandas itself uses the extension system for some types that aren't built into NumPy (categorical, period, interval, datetime with @@ -106,6 +113,24 @@ by some other storage type, like Python lists. See the `extension array source`_ for the interface definition. The docstrings and comments contain guidance for properly implementing the interface. +We provide a test suite for ensuring that your extension arrays satisfy the expected +behavior. To use the test suite, you must provide several pytest fixtures and inherit +from the base test class. The required fixtures are found in +https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/conftest.py. + +To use a test, subclass it: + +.. code-block:: python + + from pandas.tests.extension import base + + class TestConstructors(base.BaseConstructorsTests): + pass + + +See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py +for a list of all the tests available. + .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py diff --git a/doc/source/install.rst b/doc/source/install.rst index c46f78ed6b6f7..4713bbb78d633 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -15,7 +15,7 @@ Instructions for installing from source, `PyPI `__, `ActivePython `__, various Linux distributions, or a `development version `__ are also provided. -.. _install.dropping_27: +.. _install.dropping-27: Plan for dropping Python 2.7 ---------------------------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bcc442189bf11..cb96c7093c005 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -11,7 +11,7 @@ version. .. 
warning:: Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`here ` for more. + See :ref:`install.dropping-27` for more. .. _whatsnew_0230.enhancements: @@ -335,8 +335,8 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist .. _whatsnew_023.enhancements.extension: -Extending Pandas with Custom Types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Extending Pandas with Custom Types (Experimental) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy arrays as columns in a DataFrame or values in a Series. This allows third-party diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d49a0d799526a..97a764fa7dbe8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,10 @@ -"""An interface for extending pandas with custom arrays.""" +"""An interface for extending pandas with custom arrays. + +.. warning:: + + This is an experimental API and subject to breaking changes + without warning. +""" import numpy as np from pandas.errors import AbstractMethodError @@ -14,12 +20,15 @@ class ExtensionArray(object): with a custom type and will not attempt to coerce them to objects. They may be stored directly inside a :class:`DataFrame` or :class:`Series`. + .. versionadded:: 0.23.0 + Notes ----- The interface includes the following abstract methods that must be implemented by subclasses: * _constructor_from_sequence + * _from_factorized * __getitem__ * __len__ * dtype @@ -30,11 +39,21 @@ class ExtensionArray(object): * _concat_same_type Some additional methods are available to satisfy pandas' internal, private - block API. + block API: * _can_hold_na * _formatting_values + Some methods require casting the ExtensionArray to an ndarray of Python + objects with ``self.astype(object)``, which may be expensive. When + performance is a concern, we highly recommend overriding the following + methods: + + * fillna + * unique + * factorize / _values_for_factorize + * argsort / _values_for_argsort + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -50,10 +69,6 @@ class ExtensionArray(object): by some other storage type, like Python lists. Pandas makes no assumptions on how the data are stored, just that it can be converted to a NumPy array. - - Extension arrays should be able to be constructed with instances of - the class, i.e. ``ExtensionArray(extension_array)`` should return - an instance, not error. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. 
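For orientation, the test-suite wiring that the new ``extending.rst`` section above describes looks roughly like the following minimal sketch. ``MyArray`` and ``MyDtype`` are hypothetical stand-ins for a third-party extension type, and only two of the required fixtures are shown; the ``conftest.py`` linked above lists the full set.

.. code-block:: python

    # contents of test_myarray.py -- illustrative only, not part of the patch
    import pytest

    from pandas.tests.extension import base

    from myarray import MyArray, MyDtype  # hypothetical package


    @pytest.fixture
    def dtype():
        # an instance of the ExtensionDtype under test
        return MyDtype()


    @pytest.fixture
    def data():
        # length-100 array of valid values, as the base fixtures expect
        return MyArray(list(range(100)))


    class TestConstructors(base.BaseConstructorsTests):
        pass

Each ``base.Base*Tests`` class covers one slice of the interface, so implementations can adopt the suite incrementally.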
From 54470f3bace02e8be513a1ab9fbbe0b2130da476 Mon Sep 17 00:00:00 2001
From: KOBAYASHI Ittoku <38392245+kittoku@users.noreply.github.com>
Date: Sun, 22 Apr 2018 01:39:42 +0900
Subject: [PATCH 22/33] BUG: unexpected assign by a single-element list
 (GH19474) (#20732)

---
 doc/source/whatsnew/v0.23.0.txt    |  1 +
 pandas/core/indexing.py            |  3 ++-
 pandas/tests/indexing/test_iloc.py | 16 ++++++++++++++++
 pandas/tests/indexing/test_loc.py  | 16 ++++++++++++++++
 4 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index cb96c7093c005..1c9849730edd6 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1113,6 +1113,7 @@ Indexing
 - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` in presence of entire rows of NaNs in the middle of values (:issue:`20499`).
 - Bug in :class:`IntervalIndex` where some indexing operations were not supported for overlapping or non-monotonic ``uint64`` data (:issue:`20636`)
 - Bug in ``Series.is_unique`` where extraneous output in stderr is shown if Series contains objects with ``__ne__`` defined (:issue:`20661`)
+- Bug in ``.loc`` assignment with a single-element list-like incorrectly assigns as a list (:issue:`19474`)
 - Bug in partial string indexing on a ``Series/DataFrame`` with a monotonic decreasing ``DatetimeIndex`` (:issue:`19362`)
 
 MultiIndex
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 5240a4703c242..2eb52ecc6bcc7 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -532,7 +532,8 @@ def setter(item, v):
 
         def can_do_equal_len():
             """ return True if we have an equal len settable """
-            if not len(labels) == 1 or not np.iterable(value):
+            if (not len(labels) == 1 or not np.iterable(value) or
+                    is_scalar(plane_indexer[0])):
                 return False
 
             l = len(value)
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index a5506abe8f355..f1178d44dbfe0 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -10,6 +10,7 @@
 from pandas import Series, DataFrame, date_range, concat, isna
 from pandas.util import testing as tm
 from pandas.tests.indexing.common import Base
+from pandas.api.types import is_scalar
 
 
 class TestiLoc(Base):
@@ -526,6 +527,21 @@ def test_iloc_setitem_list_of_lists(self):
                                         B=[5, 6, 11, 13, 9]))
         tm.assert_frame_equal(df, expected)
 
+    @pytest.mark.parametrize(
+        'indexer', [[0], slice(None, 1, None), np.array([0])])
+    @pytest.mark.parametrize(
+        'value', [['Z'], np.array(['Z'])])
+    def test_iloc_setitem_with_scalar_index(self, indexer, value):
+        # GH #19474
+        # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated
+        # element-wise, not using "setter('A', ['Z'])".
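+        # The ``is_scalar(plane_indexer[0])`` guard added to
+        # ``can_do_equal_len`` above is what routes this case to the
+        # element-wise setter: a one-element list-like value no longer
+        # counts as an "equal length" settable, so 'Z' is stored as a
+        # scalar instead of the list object ['Z'].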
+
+        df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+        df.iloc[0, indexer] = value
+        result = df.iloc[0, 0]
+
+        assert is_scalar(result) and result == 'Z'
+
     def test_iloc_mask(self):
 
         # GH 3631, iloc with a mask (of a series) should raise
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 86a5a82441ee8..39f4d2b7bd395 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -11,6 +11,7 @@
 from pandas import Series, DataFrame, Timestamp, date_range, MultiIndex, Index
 from pandas.util import testing as tm
 from pandas.tests.indexing.common import Base
+from pandas.api.types import is_scalar
 
 
 class TestLoc(Base):
@@ -555,6 +556,21 @@ def test_loc_setitem_frame_multiples(self):
         df.loc[2:4] = rhs
         tm.assert_frame_equal(df, expected)
 
+    @pytest.mark.parametrize(
+        'indexer', [['A'], slice(None, 'A', None), np.array(['A'])])
+    @pytest.mark.parametrize(
+        'value', [['Z'], np.array(['Z'])])
+    def test_loc_setitem_with_scalar_index(self, indexer, value):
+        # GH #19474
+        # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated
+        # element-wise, not using "setter('A', ['Z'])".
+
+        df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
+        df.loc[0, indexer] = value
+        result = df.loc[0, 'A']
+
+        assert is_scalar(result) and result == 'Z'
+
     def test_loc_coerceion(self):
 
         # 12411

From 669d9b20689ea2a8eb10429382e88a8a9ecc5659 Mon Sep 17 00:00:00 2001
From: topper-123
Date: Sat, 21 Apr 2018 18:23:13 +0100
Subject: [PATCH 23/33] Add interpolate to doc string (#20776)

---
 pandas/core/generic.py  | 1 +
 pandas/core/resample.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index d3ab7afc025c9..2b683d3a606b1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -5289,6 +5289,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
 
         See Also
         --------
+        interpolate : Fill NaN values using interpolation.
         reindex, asfreq
 
         Returns
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 0d0023b9f67d3..f8d283e932f44 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -659,6 +659,7 @@ def fillna(self, method, limit=None):
         pad : Forward fill NaN values in the resampled data.
         nearest : Fill NaN values in the resampled data
             with nearest neighbor starting from center.
+        interpolate : Fill NaN values using interpolation.
         pandas.Series.fillna : Fill NaN values in the Series using the
             specified method, which can be 'bfill' and 'ffill'.
         pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the

From 336fba7c0191444c3328009e6d4f9f5d00ee224b Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 21 Apr 2018 13:42:57 -0400
Subject: [PATCH 24/33] TST: #20720 xfail problematic s3 / moto tests

---
 pandas/tests/io/parser/test_network.py | 1 +
 pandas/tests/io/test_parquet.py        | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index fdf45f307e953..416535aac3a4c 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -183,6 +183,7 @@ def test_read_csv_handles_boto_s3_object(self,
         expected = read_csv(tips_file)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(reason="buggy s3 / moto interaction on CI: gh-20720")
     def test_read_csv_chunked_download(self, s3_resource, caplog):
         # 8 MB, S3FS uses 5MB chunks
         df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 11cbea8ce6331..41a1db57c954b 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -432,6 +432,7 @@ def test_categorical_unsupported(self, pa_lt_070):
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
         self.check_error_on_write(df, pa, NotImplementedError)
 
+    @pytest.mark.xfail(reason="buggy s3 / moto interaction on CI: gh-20720")
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa,
@@ -498,6 +499,7 @@ def test_filter_row_groups(self, fp):
             result = read_parquet(path, fp, filters=[('a', '==', 0)])
             assert len(result) == 1
 
+    @pytest.mark.xfail(reason="buggy s3 / moto interaction on CI: gh-20720")
     def test_s3_roundtrip(self, df_compat, s3_resource, fp):
         # GH #19134
         check_round_trip(df_compat, fp,

From 7e75e4ae7cc8a693ca25f7bfe255574b8a91fa03 Mon Sep 17 00:00:00 2001
From: Maximiliano Greco
Date: Sat, 21 Apr 2018 20:14:14 +0200
Subject: [PATCH 25/33] Fixed WOM offset when n=0 (#20549)

---
 doc/source/api.rst                                |  2 ++
 doc/source/whatsnew/v0.23.0.txt                   |  3 ++-
 pandas/tests/indexes/datetimes/test_date_range.py |  6 ++++++
 pandas/tests/tseries/offsets/test_offsets.py      | 15 +++++++++++++--
 pandas/tseries/offsets.py                         |  3 ---
 5 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index e224e9927f55c..e43632ea46bfb 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -2106,6 +2106,7 @@ Standard moving window functions
    Rolling.skew
    Rolling.kurt
    Rolling.apply
+   Rolling.aggregate
    Rolling.quantile
    Window.mean
    Window.sum
@@ -2133,6 +2134,7 @@ Standard expanding window functions
    Expanding.skew
    Expanding.kurt
    Expanding.apply
+   Expanding.aggregate
    Expanding.quantile
 
 Exponentially-weighted moving window functions
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 1c9849730edd6..e340acc17fe9f 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -438,6 +438,7 @@ Other Enhancements
   ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``,
   ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``.
(:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) +- :class:`WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). - :class:`DataFrame` and :class:`Series` now support matrix multiplication (```@```) operator (:issue:`10259`) for Python>=3.5 - Updated ``to_gbq`` and ``read_gbq`` signature and documentation to reflect changes from the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ @@ -847,7 +848,7 @@ Other API Changes - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`). - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`) -- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than an ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`) +- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`) .. 
_whatsnew_0230.deprecations: diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 2dfd4ae3e6e3a..e5291ed52a86c 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -236,6 +236,12 @@ def test_catch_infinite_loop(self): pytest.raises(Exception, date_range, datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) + @pytest.mark.parametrize('periods', (1, 2)) + def test_wom_len(self, periods): + # https://github.com/pandas-dev/pandas/issues/20517 + res = date_range(start='20110101', periods=periods, freq='WOM-1MON') + assert len(res) == periods + class TestGenRangeGeneration(object): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index d96ebab615d12..5369b1a94a956 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -2228,8 +2228,6 @@ class TestWeekOfMonth(Base): _offset = WeekOfMonth def test_constructor(self): - tm.assert_raises_regex(ValueError, "^N cannot be 0", - WeekOfMonth, n=0, week=1, weekday=1) tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth, n=1, week=4, weekday=0) tm.assert_raises_regex(ValueError, "^Week", WeekOfMonth, @@ -2261,6 +2259,19 @@ def test_offset(self): (-1, 2, 1, date3, datetime(2010, 12, 21)), (-1, 2, 1, date4, datetime(2011, 1, 18)), + (0, 0, 1, date1, datetime(2011, 1, 4)), + (0, 0, 1, date2, datetime(2011, 2, 1)), + (0, 0, 1, date3, datetime(2011, 2, 1)), + (0, 0, 1, date4, datetime(2011, 2, 1)), + (0, 1, 1, date1, datetime(2011, 1, 11)), + (0, 1, 1, date2, datetime(2011, 1, 11)), + (0, 1, 1, date3, datetime(2011, 2, 8)), + (0, 1, 1, date4, datetime(2011, 2, 8)), + (0, 0, 1, date1, datetime(2011, 1, 4)), + (0, 1, 1, date2, datetime(2011, 1, 11)), + (0, 2, 1, date3, datetime(2011, 1, 18)), + (0, 3, 1, date4, datetime(2011, 1, 25)), + (1, 0, 0, date1, datetime(2011, 2, 7)), (1, 0, 0, date2, datetime(2011, 2, 7)), (1, 0, 0, date3, datetime(2011, 2, 7)), diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 2e4be7fbdeebf..749165f894819 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1461,9 +1461,6 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): self.weekday = weekday self.week = week - if self.n == 0: - raise ValueError('N cannot be 0') - if self.weekday < 0 or self.weekday > 6: raise ValueError('Day must be 0<=day<=6, got {day}' .format(day=self.weekday)) From 0d199e4e8bb2b9ce73a35889b49d847283fadce5 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Sun, 22 Apr 2018 02:23:54 +0800 Subject: [PATCH 26/33] BUG: Fix problems in group rank when both nans and infinity are present #20561 (#20681) --- doc/source/whatsnew/v0.23.0.txt | 8 +++-- pandas/_libs/groupby_helper.pxi.in | 31 +++++++++++------- pandas/tests/groupby/test_groupby.py | 49 ++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e340acc17fe9f..e19aedac80213 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -221,6 +221,12 @@ Current Behavior: s.rank(na_option='top') +These bugs were squashed: + +- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) +- Bug in :meth:`Series.rank` and 
:meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`) +- Bug in :func:`DataFrameGroupBy.rank` where ranks were incorrect when both infinity and ``NaN`` were present (:issue:`20561`) + .. _whatsnew_0230.enhancements.round-trippable_json: JSON read/write round-trippable with ``orient='table'`` @@ -1082,14 +1088,12 @@ Offsets Numeric ^^^^^^^ -- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) - Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) - Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`) -- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``ascending='False'`` failed to return correct ranks for infinity if ``NaN`` were present (:issue:`19538`) - Bug where ``NaN`` was returned instead of 0 by :func:`Series.pct_change` and :func:`DataFrame.pct_change` when ``fill_method`` is not ``None`` (:issue:`19873`) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index de802f4a72277..6a33e4a09476d 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -417,7 +417,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[int64_t] labels, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): - """Provides the rank of values within each group + """ + Provides the rank of values within each group. 
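+
+    Ranks are written to ``out`` following the original order of
+    ``values``; tie and missing-value handling are controlled by
+    ``ties_method`` and ``na_option`` below.
+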
Parameters ---------- @@ -425,17 +426,24 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, values : array of {{c_type}} values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` - is_datetimelike : bool + is_datetimelike : bool, default False unused in this method but provided for call compatibility with other Cython transformations - ties_method : {'keep', 'top', 'bottom'} + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' + pct : boolean, default False + Compute percentage rank of data within each group + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending - ascending : boolean - False for ranks by high (1) to low (N) - pct : boolean - Compute percentage rank of data within each group Notes ----- @@ -508,7 +516,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and masked_vals[_as[i]] == nan_fill_val: + + if keep_na and mask[_as[i]]: grp_na_count += 1 out[_as[i], 0] = nan else: @@ -548,9 +557,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # reset the dups and sum_ranks, knowing that a new value is coming # up. the conditional also needs to handle nan equality and the # end of iteration - if (i == N - 1 or ( - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not - (mask[_as[i]] and mask[_as[i+1]]))): + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cdb4e3072c65d..c3400b6b710e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1965,6 +1965,55 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): exp_df = DataFrame(exp * len(grps), columns=['val']) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], + ]) + @pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ + ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), + ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), + ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), + ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), + ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), + ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), + ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), + ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), + ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), + ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), + ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 
5.]), + ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), + ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), + ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), + ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), + ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), + ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), + ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), + ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), + ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), + ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), + ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), + ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), + ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), + ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), + ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), + ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), + ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) + ]) + def test_infs_n_nans(self, grps, vals, ties_method, ascending, na_option, + exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option) + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ From 8def64931af8a01f4af50d79a8d628fe3e63f00c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 21 Apr 2018 18:26:27 -0400 Subject: [PATCH 27/33] TST: split test_groupby.py (#20781) closes #20696 --- .../tests/groupby/aggregate/test_aggregate.py | 71 +- pandas/tests/groupby/common.py | 62 - pandas/tests/groupby/conftest.py | 77 + pandas/tests/groupby/test_apply.py | 517 ++ pandas/tests/groupby/test_categorical.py | 1415 ++--- pandas/tests/groupby/test_filters.py | 1180 ++--- pandas/tests/groupby/test_function.py | 1120 ++++ pandas/tests/groupby/test_functional.py | 372 -- pandas/tests/groupby/test_groupby.py | 4606 ++++++----------- pandas/tests/groupby/test_grouping.py | 115 +- pandas/tests/groupby/test_nth.py | 618 +-- pandas/tests/groupby/test_rank.py | 254 + pandas/tests/groupby/test_transform.py | 1464 +++--- 13 files changed, 5983 insertions(+), 5888 deletions(-) delete mode 100644 pandas/tests/groupby/common.py create mode 100644 pandas/tests/groupby/conftest.py create mode 100644 pandas/tests/groupby/test_apply.py create mode 100644 pandas/tests/groupby/test_function.py delete mode 100644 pandas/tests/groupby/test_functional.py create mode 100644 pandas/tests/groupby/test_rank.py diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d85719d328ff2..b2f18e11de8ee 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -15,51 +15,6 @@ import pandas.util.testing as tm -@pytest.fixture -def ts(): - return tm.makeTimeSeries() - - -@pytest.fixture -def tsframe(): - return DataFrame(tm.getTimeSeriesData()) - - -@pytest.fixture -def df(): - return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - -@pytest.fixture -def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - 
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), - index=index, - columns=['A', 'B', 'C']) - - -@pytest.fixture -def three_group(): - return DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', - 'bar', 'bar', 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', - 'one', 'two', 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', - 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_agg_regression1(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) @@ -87,6 +42,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_groupby_aggregation_mixed_dtype(): + + # GH 6212 + expected = DataFrame({ + 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], + 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, + index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), + ('big', 'damp'), + ('blue', 'dry'), + ('red', 'red'), ('red', 'wet')], + names=['by1', 'by2'])) + + df = DataFrame({ + 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, + 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, + np.nan, np.nan] + }) + + g = df.groupby(['by1', 'by2']) + result = g[['v1', 'v2']].mean() + tm.assert_frame_equal(result, expected) + + def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA grouped = ts.groupby(ts * np.nan) diff --git a/pandas/tests/groupby/common.py b/pandas/tests/groupby/common.py deleted file mode 100644 index 3e99e8211b4f8..0000000000000 --- a/pandas/tests/groupby/common.py +++ /dev/null @@ -1,62 +0,0 @@ -""" Base setup """ - -import pytest -import numpy as np -from pandas.util import testing as tm -from pandas import DataFrame, MultiIndex - - -@pytest.fixture -def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - -@pytest.fixture -def df(): - return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - -class MixIn(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = df() - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - self.mframe = mframe() - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - -def 
assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py new file mode 100644 index 0000000000000..877aa835ac6f5 --- /dev/null +++ b/pandas/tests/groupby/conftest.py @@ -0,0 +1,77 @@ +import pytest +import numpy as np +from pandas import MultiIndex, DataFrame +from pandas.util import testing as tm + + +@pytest.fixture +def mframe(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + +@pytest.fixture +def ts(): + return tm.makeTimeSeries() + + +@pytest.fixture +def seriesd(): + return tm.getSeriesData() + + +@pytest.fixture +def tsd(): + return tm.getTimeSeriesData() + + +@pytest.fixture +def frame(seriesd): + return DataFrame(seriesd) + + +@pytest.fixture +def tsframe(tsd): + return DataFrame(tsd) + + +@pytest.fixture +def df_mixed_floats(): + return DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + +@pytest.fixture +def three_group(): + return DataFrame({'A': ['foo', 'foo', 'foo', + 'foo', 'bar', 'bar', + 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', + 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', + 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py new file mode 100644 index 0000000000000..5ca10fe1af9d1 --- /dev/null +++ b/pandas/tests/groupby/test_apply.py @@ -0,0 +1,517 @@ +import pytest +import numpy as np +import pandas as pd +from datetime import datetime +from pandas.util import testing as tm +from pandas import DataFrame, MultiIndex, compat, Series, bdate_range, Index + + +def test_apply_issues(): + # GH 5788 + + s = """2011.05.16,00:00,1.40893 +2011.05.16,01:00,1.40760 +2011.05.16,02:00,1.40750 +2011.05.16,03:00,1.40649 +2011.05.17,02:00,1.40893 +2011.05.17,03:00,1.40760 +2011.05.17,04:00,1.40750 +2011.05.17,05:00,1.40649 +2011.05.18,02:00,1.40893 +2011.05.18,03:00,1.40760 +2011.05.18,04:00,1.40750 +2011.05.18,05:00,1.40649""" + + df = pd.read_csv( + compat.StringIO(s), header=None, names=['date', 'time', 'value'], + parse_dates=[['date', 'time']]) + df = df.set_index('date_time') + + expected = df.groupby(df.index.date).idxmax() + result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) + tm.assert_frame_equal(result, expected) + + # GH 5789 + # don't auto coerce dates + df = pd.read_csv( + compat.StringIO(s), header=None, names=['date', 'time', 'value']) + exp_idx = pd.Index( + ['2011.05.16', '2011.05.17', '2011.05.18' + ], dtype=object, name='date') + expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) + result = df.groupby('date').apply( + lambda x: x['time'][x['value'].idxmax()]) + tm.assert_series_equal(result, expected) + + +def test_apply_trivial(): + # GH 20066 + # trivial 
apply: ignore input and return a constant dataframe.
+    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
+                       'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
+                      columns=['key', 'data'])
+    expected = pd.concat([df.iloc[1:], df.iloc[1:]],
+                         axis=1, keys=['float64', 'object'])
+    result = df.groupby([str(x) for x in df.dtypes],
+                        axis=1).apply(lambda x: df.iloc[1:])
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(reason=("GH 20066; function passed into apply "
+                           "returns a DataFrame with the same index "
+                           "as the one to create GroupBy object."))
+def test_apply_trivial_fail():
+    # GH 20066
+    # trivial apply fails if the constant dataframe has the same index
+    # as the one used to create the GroupBy object.
+    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
+                       'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
+                      columns=['key', 'data'])
+    expected = pd.concat([df, df],
+                         axis=1, keys=['float64', 'object'])
+    result = df.groupby([str(x) for x in df.dtypes],
+                        axis=1).apply(lambda x: df)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_fast_apply():
+    # make sure that fast apply is correctly called
+    # rather than raising any kind of error;
+    # otherwise the python path will be called,
+    # which slows things down
+    N = 1000
+    labels = np.random.randint(0, 2000, size=N)
+    labels2 = np.random.randint(0, 3, size=N)
+    df = DataFrame({'key': labels,
+                    'key2': labels2,
+                    'value1': np.random.randn(N),
+                    'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
+
+    def f(g):
+        return 1
+
+    g = df.groupby(['key', 'key2'])
+
+    grouper = g.grouper
+
+    splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
+    group_keys = grouper._get_group_keys()
+
+    values, mutated = splitter.fast_apply(f, group_keys)
+    assert not mutated
+
+
+def test_apply_with_mixed_dtype():
+    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
+    df = DataFrame({'foo1': np.random.randn(6),
+                    'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
+    result = df.apply(lambda x: x, axis=1)
+    tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
+
+    # GH 3610 incorrect dtype conversion with as_index=False
+    df = DataFrame({"c1": [1, 2, 6, 6, 8]})
+    df["c2"] = df.c1 / 2.0
+    result1 = df.groupby("c2").mean().reset_index().c2
+    result2 = df.groupby("c2", as_index=False).mean().c2
+    tm.assert_series_equal(result1, result2)
+
+
+def test_groupby_as_index_apply(df):
+    # GH #4648 and #3417
+    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
+                    'user_id': [1, 2, 1, 1, 3, 1],
+                    'time': range(6)})
+
+    g_as = df.groupby('user_id', as_index=True)
+    g_not_as = df.groupby('user_id', as_index=False)
+
+    res_as = g_as.head(2).index
+    res_not_as = g_not_as.head(2).index
+    exp = Index([0, 1, 2, 4])
+    tm.assert_index_equal(res_as, exp)
+    tm.assert_index_equal(res_not_as, exp)
+
+    res_as_apply = g_as.apply(lambda x: x.head(2)).index
+    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
+
+    # apply doesn't maintain the original ordering
+    # changed in GH5610 as the as_index=False returns a MI here
+    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
+        2, 4)])
+    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
+    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
+
+    tm.assert_index_equal(res_as_apply, exp_as_apply)
+    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
+
+    ind = Index(list('abcde'))
+    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
+    res = df.groupby(0, as_index=False).apply(lambda x: x).index
+    tm.assert_index_equal(res, ind)
+
+
+def 
test_apply_concat_preserve_names(three_group): + grouped = three_group.groupby(['A', 'B']) + + def desc(group): + result = group.describe() + result.index.name = 'stat' + return result + + def desc2(group): + result = group.describe() + result.index.name = 'stat' + result = result[:len(group)] + # weirdo + return result + + def desc3(group): + result = group.describe() + + # names are different + result.index.name = 'stat_%d' % len(group) + + result = result[:len(group)] + # weirdo + return result + + result = grouped.apply(desc) + assert result.index.names == ('A', 'B', 'stat') + + result2 = grouped.apply(desc2) + assert result2.index.names == ('A', 'B', 'stat') + + result3 = grouped.apply(desc3) + assert result3.index.names == ('A', 'B', None) + + +def test_apply_series_to_frame(): + def f(piece): + with np.errstate(invalid='ignore'): + logged = np.log(piece) + return DataFrame({'value': piece, + 'demeaned': piece - piece.mean(), + 'logged': logged}) + + dr = bdate_range('1/1/2000', periods=100) + ts = Series(np.random.randn(100), index=dr) + + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(f) + + assert isinstance(result, DataFrame) + tm.assert_index_equal(result.index, ts.index) + + +def test_apply_series_yield_constant(df): + result = df.groupby(['A', 'B'])['C'].apply(len) + assert result.index.names[:2] == ('A', 'B') + + +def test_apply_frame_yield_constant(df): + # GH13568 + result = df.groupby(['A', 'B']).apply(len) + assert isinstance(result, Series) + assert result.name is None + + result = df.groupby(['A', 'B'])[['C', 'D']].apply(len) + assert isinstance(result, Series) + assert result.name is None + + +def test_apply_frame_to_series(df): + grouped = df.groupby(['A', 'B']) + result = grouped.apply(len) + expected = grouped.count()['C'] + tm.assert_index_equal(result.index, expected.index) + tm.assert_numpy_array_equal(result.values, expected.values) + + +def test_apply_frame_concat_series(): + def trans(group): + return group.groupby('B')['C'].sum().sort_values()[:2] + + def trans2(group): + grouped = group.groupby(df.reindex(group.index)['B']) + return grouped.sum().sort_values()[:2] + + df = DataFrame({'A': np.random.randint(0, 5, 1000), + 'B': np.random.randint(0, 5, 1000), + 'C': np.random.randn(1000)}) + + result = df.groupby('A').apply(trans) + exp = df.groupby('A')['C'].apply(trans2) + tm.assert_series_equal(result, exp, check_names=False) + assert result.name == 'C' + + +def test_apply_transform(ts): + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + tm.assert_series_equal(result, expected) + + +def test_apply_multikey_corner(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + + def f(group): + return group.sort_values('A')[-5:] + + result = grouped.apply(f) + for key, group in grouped: + tm.assert_frame_equal(result.loc[key], f(group)) + + +def test_apply_chunk_view(): + # Low level tinkering could be unsafe, make sure not + df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'value': compat.lrange(9)}) + + # return view + f = lambda x: x[:2] + + result = df.groupby('key', group_keys=False).apply(f) + expected = df.take([0, 1, 3, 4, 6, 7]) + tm.assert_frame_equal(result, expected) + + +def test_apply_no_name_column_conflict(): + df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + 'value': compat.lrange(10)[::-1]}) + + # it works! 
#2605 + grouped = df.groupby(['name', 'name2']) + grouped.apply(lambda x: x.sort_values('value', inplace=True)) + + +def test_apply_typecast_fail(): + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile( + ['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_multiindex_fail(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] + ]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_corner(tsframe): + result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = tsframe * 2 + tm.assert_frame_equal(result, expected) + + +def test_apply_without_copy(): + # GH 5545 + # returning a non-copy in an applied function fails + + data = DataFrame({'id_field': [100, 100, 200, 300], + 'category': ['a', 'b', 'c', 'c'], + 'value': [1, 2, 3, 4]}) + + def filt1(x): + if x.shape[0] == 1: + return x.copy() + else: + return x[x.category == 'c'] + + def filt2(x): + if x.shape[0] == 1: + return x + else: + return x[x.category == 'c'] + + expected = data.groupby('id_field').apply(filt1) + result = data.groupby('id_field').apply(filt2) + tm.assert_frame_equal(result, expected) + + +def test_apply_corner_cases(): + # #535, can't use sliding iterator + + N = 1000 + labels = np.random.randint(0, 100, size=N) + df = DataFrame({'key': labels, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + + grouped = df.groupby('key') + + def f(g): + g['value3'] = g['value1'] * 2 + return g + + result = grouped.apply(f) + assert 'value3' in result + + +def test_apply_numeric_coercion_when_datetime(): + # In the past, group-by/apply operations have been over-eager + # in converting dtypes to numeric, in the presence of datetime + # columns. Various GH issues were filed, the reproductions + # for which are here. 
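+    # Pattern used below: build the expected frame while the column is
+    # still plain strings, convert it to datetime, then check that apply
+    # leaves the remaining object column (e.g. the string "inf") un-coerced.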
+ + # GH 15670 + df = pd.DataFrame({'Number': [1, 2], + 'Date': ["2017-03-02"] * 2, + 'Str': ["foo", "inf"]}) + expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + df.Date = pd.to_datetime(df.Date) + result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(result['Str'], expected['Str']) + + # GH 15421 + df = pd.DataFrame({'A': [10, 20, 30], + 'B': ['foo', '3', '4'], + 'T': [pd.Timestamp("12:31:22")] * 3}) + + def get_B(g): + return g.iloc[0][['B']] + result = df.groupby('A').apply(get_B)['B'] + expected = df.B + expected.index = df.A + tm.assert_series_equal(result, expected) + + # GH 14423 + def predictions(tool): + out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) + if 'step1' in list(tool.State): + out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) + if 'step2' in list(tool.State): + out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) + out['useTime'] = str( + tool[tool.State == 'step2'].oTime.values[0]) + return out + df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], + 'State': ['step1', 'step2', 'step1', 'step2'], + 'oTime': ['', '2016-09-19 05:24:33', + '', '2016-09-19 23:59:04'], + 'Machine': ['23', '36L', '36R', '36R']}) + df2 = df1.copy() + df2.oTime = pd.to_datetime(df2.oTime) + expected = df1.groupby('Key').apply(predictions).p1 + result = df2.groupby('Key').apply(predictions).p1 + tm.assert_series_equal(expected, result) + + +def test_time_field_bug(): + # Test a fix for the following error related to GH issue 11324 When + # non-key fields in a group-by dataframe contained time-based fields + # that were not returned by the apply function, an exception would be + # raised. + + df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]}) + + def func_with_no_date(batch): + return pd.Series({'c': 2}) + + def func_with_date(batch): + return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) + + dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) + dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) + dfg_no_conversion_expected.index.name = 'a' + + dfg_conversion = df.groupby(by=['a']).apply(func_with_date) + dfg_conversion_expected = pd.DataFrame( + {'b': datetime(2015, 1, 1), + 'c': 2}, index=[1]) + dfg_conversion_expected.index.name = 'a' + + tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) + tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected) + + +def test_gb_apply_list_of_unequal_len_arrays(): + + # GH1738 + df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a', + 'b', 'b', 'b'], + 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd', + 'd', 'd', 'e'], + 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], + 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]}) + df = df.set_index(['group1', 'group2']) + df_grouped = df.groupby(level=['group1', 'group2'], sort=True) + + def noddy(value, weight): + out = np.array(value * weight).repeat(3) + return out + + # the kernel function returns arrays of unequal length + # pandas sniffs the first one, sees it's an array and not + # a list, and assumed the rest are of equal length + # and so tries a vstack + + # don't die + df_grouped.apply(lambda x: noddy(x.value, x.weight)) + + +def test_groupby_apply_all_none(): + # Tests to make sure no errors if apply function returns all None + # values. Issue 9684. 
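+    # With every per-group return value being None there is nothing to
+    # concatenate, so the expected result is an empty DataFrame rather
+    # than an error.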
+ test_df = DataFrame({'groups': [0, 0, 1, 1], + 'random_vars': [8, 7, 4, 5]}) + + def test_func(x): + pass + + result = test_df.groupby('groups').apply(test_func) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + +def test_groupby_apply_none_first(): + # GH 12824. Tests if apply returns None first. + test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) + test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) + + def test_func(x): + if x.shape[0] < 2: + return None + return x.iloc[[0, -1]] + + result1 = test_df1.groupby('groups').apply(test_func) + result2 = test_df2.groupby('groups').apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], + names=['groups', None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], + names=['groups', None]) + expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, + index=index1) + expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, + index=index2) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index bcd0da28b5a34..160b60e69f39d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -9,710 +9,725 @@ import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series, Interval) + DataFrame, Categorical, Series, Interval, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm -from .common import MixIn - - -class TestGroupByCategorical(MixIn): - - def test_groupby(self): - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) - expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) - result = data.groupby("b").mean() - tm.assert_frame_equal(result, expected) - - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) - - # single grouper - gb = df.groupby("A") - exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # GH 8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. 
Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name) - - g = x.groupby(['person_id']) - result = g.transform(lambda x: x) - tm.assert_frame_equal(result, x[['person_name']]) - - result = x.drop_duplicates('person_name') - expected = x.iloc[[0, 1]] - tm.assert_frame_equal(result, expected) - - def f(x): - return x.drop_duplicates('person_name').iloc[0] - - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name='person_id') - expected['person_name'] = expected['person_name'].astype('object') - tm.assert_frame_equal(result, expected) - - # GH 9921 - # Monotonic - df = DataFrame({"a": [5, 15, 25]}) - c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) - - # Filter - tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) - tm.assert_frame_equal(df.groupby(c).filter(np.all), df) - - # Non-monotonic - df = DataFrame({"a": [5, 15, 25, -5]}) - c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) - - # GH 9603 - df = DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) - result = df.groupby(c).apply(len) - - exp_index = CategoricalIndex( - c.values.categories, ordered=c.values.ordered) - expected = Series([1, 0, 0, 0], index=exp_index) - expected.index.name = 'a' - tm.assert_series_equal(result, expected) - - def test_groupby_sort(self): - - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby - # This should result in a properly sorted Series so that the plot - # has a sorted x axis - # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) - - res = df.groupby(['value_group'])['value_group'].count() - exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] - exp.index = CategoricalIndex(exp.index, name=exp.index.name) - tm.assert_series_equal(res, exp) - - def test_level_groupby_get_group(self): - # GH15155 - df = DataFrame(data=np.arange(2, 22, 2), - index=MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - labels=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"])) - g = df.groupby(level=["Index1"]) - - # expected should equal test.loc[["a"]] - # GH15166 - expected = DataFrame(data=np.arange(2, 12, 2), - index=pd.MultiIndex(levels=[pd.CategoricalIndex( - ["a", "b"]), range(5)], - labels=[[0] * 5, range(5)], - names=["Index1", "Index2"])) - result = g.get_group('a') - assert_frame_equal(result, expected) - - def test_apply_use_categorical_name(self): - from pandas import qcut - cats = qcut(self.df.C, 4) - - def 
get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} - - result = self.df.groupby(cats).D.apply(get_stats) - assert result.index.names[0] == 'C' - - def test_apply_categorical_data(self): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - - def test_groupby_categorical(self): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) - - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, sort=False).describe() - assert_frame_equal(desc_result, expected) - - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) - - def test_groupby_datetime_categorical(self): - # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) - ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() - assert_frame_equal(desc_result, expected) - tm.assert_index_equal(desc_result.index, expected.index) - tm.assert_index_equal( - desc_result.index.get_level_values(0), - 
expected.index.get_level_values(0)) - - # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) - - def test_groupby_categorical_index(self): - - s = np.random.RandomState(12345) - levels = ['foo', 'bar', 'baz', 'qux'] - codes = s.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats - - # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - # with a cat column, should produce a cat index - result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - def test_groupby_describe_categorical_columns(self): - # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) - df = DataFrame(np.random.randn(20, 4), columns=cats) - result = df.groupby([1, 2, 3, 4] * 5).describe() - - tm.assert_index_equal(result.stack().columns, cats) - tm.assert_categorical_equal(result.stack().columns.values, cats.values) - - def test_groupby_unstack_categorical(self): - # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') - - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() - result = gcat.describe() - - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') - tm.assert_index_equal(result.columns, exp_columns) - tm.assert_categorical_equal(result.columns.values, exp_columns.values) - - result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) - tm.assert_series_equal(result, expected) - - def test_groupby_bins_unequal_len(self): - # GH3011 - series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - bins = pd.cut(series.dropna().values, 4) - - # len(bins) != len(series) here - def f(): - series.groupby(bins).mean() - pytest.raises(ValueError, f) - - def test_groupby_multi_categorical_as_index(self): - # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # another not in-axis grouper - s = Series(['a', 'b', 'b'], 
name='cat2') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # GH18872: conflicting names in desired index - pytest.raises(ValueError, lambda: df.groupby(['cat', - s.rename('cat')]).sum()) - - # is original index dropped? - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - - group_columns = ['cat', 'A'] - - for name in [None, 'X', 'B', 'cat']: - df.index = Index(list("abc"), name=name) - - if name in group_columns and name in df.index.names: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = df.groupby(group_columns, as_index=False).sum() - - else: +def test_groupby(): + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) + result = data.groupby("b").mean() + tm.assert_frame_equal(result, expected) + + raw_cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + raw_cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + + # single grouper + gb = df.groupby("A") + exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) + expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers + gb = df.groupby(['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers with a non-cat + df = df.copy() + df['C'] = ['foo', 'bar'] * 2 + gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + np.nan, index=exp_index)}).sort_index() + expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # GH 8623 + x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], + [1, 'John P. 
Doe']], + columns=['person_id', 'person_name']) + x['person_name'] = Categorical(x.person_name) + + g = x.groupby(['person_id']) + result = g.transform(lambda x: x) + tm.assert_frame_equal(result, x[['person_name']]) + + result = x.drop_duplicates('person_name') + expected = x.iloc[[0, 1]] + tm.assert_frame_equal(result, expected) + + def f(x): + return x.drop_duplicates('person_name').iloc[0] + + result = g.apply(f) + expected = x.iloc[[0, 1]].copy() + expected.index = Index([1, 2], name='person_id') + expected['person_name'] = expected['person_name'].astype('object') + tm.assert_frame_equal(result, expected) + + # GH 9921 + # Monotonic + df = DataFrame({"a": [5, 15, 25]}) + c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + + # Filter + tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) + tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + + # Non-monotonic + df = DataFrame({"a": [5, 15, 25, -5]}) + c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + + # GH 9603 + df = DataFrame({'a': [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) + result = df.groupby(c).apply(len) + + exp_index = CategoricalIndex( + c.values.categories, ordered=c.values.ordered) + expected = Series([1, 0, 0, 0], index=exp_index) + expected.index.name = 'a' + tm.assert_series_equal(result, expected) + + +def test_groupby_sort(): + + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) + + res = df.groupby(['value_group'])['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + + +def test_level_groupby_get_group(): + # GH15155 + df = DataFrame(data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + labels=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"])) + g = df.groupby(level=["Index1"]) + + # expected should equal test.loc[["a"]] + # GH15166 + expected = DataFrame(data=np.arange(2, 12, 2), + index=pd.MultiIndex(levels=[pd.CategoricalIndex( + ["a", "b"]), range(5)], + labels=[[0] * 5, range(5)], + names=["Index1", "Index2"])) + result = g.get_group('a') + + assert_frame_equal(result, expected) + + +def test_apply_use_categorical_name(df): + cats = qcut(df.C, 4) + + def get_stats(group): + return {'min': 
group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = df.groupby(cats).D.apply(get_stats) + assert result.index.names[0] == 'C' + + +def test_apply_categorical_data(): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + + +def test_groupby_categorical(): + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) + + +def test_groupby_datetime_categorical(): + # GH9049: ensure backward compatibility + levels = pd.date_range('2014-01-01', periods=4) + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index = CategoricalIndex(expected.index, + categories=expected.index, + ordered=True) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = cats.take_nd(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels).describe() + assert_frame_equal(desc_result, expected) + tm.assert_index_equal(desc_result.index, expected.index) + tm.assert_index_equal( + desc_result.index.get_level_values(0), + expected.index.get_level_values(0)) + + # GH 10460 + expc = 
Categorical.from_codes( + np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) + + +def test_groupby_categorical_index(): + + s = np.random.RandomState(12345) + levels = ['foo', 'bar', 'baz', 'qux'] + codes = s.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, ordered=True) + df = DataFrame( + np.repeat( + np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + +def test_groupby_describe_categorical_columns(): + # GH 11558 + cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) + df = DataFrame(np.random.randn(20, 4), columns=cats) + result = df.groupby([1, 2, 3, 4] * 5).describe() + + tm.assert_index_equal(result.stack().columns, cats) + tm.assert_categorical_equal(result.stack().columns.values, cats.values) + + +def test_groupby_unstack_categorical(): + # GH11558 (example is taken from the original issue) + df = pd.DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) + df['medium'] = df['medium'].astype('category') + + gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + result = gcat.describe() + + exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, + name='medium') + tm.assert_index_equal(result.columns, exp_columns) + tm.assert_categorical_equal(result.columns.values, exp_columns.values) + + result = gcat['A'] + gcat['B'] + expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + tm.assert_series_equal(result, expected) + + +def test_groupby_bins_unequal_len(): + # GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + bins = pd.cut(series.dropna().values, 4) + + # len(bins) != len(series) here + def f(): + series.groupby(bins).mean() + pytest.raises(ValueError, f) + + +def test_groupby_multi_categorical_as_index(): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper + s = Series(['a', 'b', 'b'], name='cat2') + result = df.groupby(['cat', s], as_index=False).sum() + 
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # GH18872: conflicting names in desired index + pytest.raises(ValueError, lambda: df.groupby(['cat', + s.rename('cat')]).sum()) + + # is original index dropped? + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + group_columns = ['cat', 'A'] + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + + if name in group_columns and name in df.index.names: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): result = df.groupby(group_columns, as_index=False).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) - - def test_groupby_preserve_categories(self): - # GH-13179 - categories = list('abc') - - # ordered=True - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, + else: + result = df.groupby(group_columns, as_index=False).sum() + + tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_groupby_preserve_categories(): + # GH-13179 + categories = list('abc') + + # ordered=True + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = pd.CategoricalIndex(categories, categories, ordered=True) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + + # ordered=False + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = pd.CategoricalIndex(categories, categories, ordered=False) + nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), + ordered=False) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, + sort_index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, + nosort_index) + + +def test_groupby_preserve_categorical_dtype(): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), ordered=True)}) - index = pd.CategoricalIndex(categories, categories, ordered=True) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) - - # ordered=False - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=False)}) - sort_index = pd.CategoricalIndex(categories, categories, ordered=False) - nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), - ordered=False) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, - sort_index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, - nosort_index) - - def test_groupby_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - 
categories=list("bac"), - ordered=True)}) - # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - def test_groupby_categorical_no_compress(self): - data = Series(np.random.randn(9)) - - codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() - - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - result = data.groupby("b").mean() - result = result["a"].values - exp = np.array([1, 2, 4, np.nan]) - tm.assert_numpy_array_equal(result, exp) - - def test_groupby_sort_categorical(self): - # dataframe groupby sort was being ignored # GH 8868 - df = DataFrame([['(7.5, 10]', 10, 10], - ['(7.5, 10]', 8, 20], - ['(2.5, 5]', 5, 30], - ['(5, 7.5]', 6, 40], - ['(2.5, 5]', 4, 50], - ['(0, 2.5]', 1, 60], - ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range', ordered=True) - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - col = 'range' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range') - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', - '(0, 2.5]'], - categories=['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], - name='range') - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], 
- index=index, columns=['foo', 'bar']) - - col = 'range' - # this is an unordered categorical, but we allow this #### - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - def test_groupby_sort_categorical_datetimelike(self): - # GH10505 - - # use same data as test_groupby_sort_categorical, which category is - # corresponding to datetime.month - df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), - datetime(2011, 2, 1), datetime(2011, 5, 1), - datetime(2011, 2, 1), datetime(2011, 1, 1), - datetime(2011, 5, 1)], - 'foo': [10, 8, 5, 6, 4, 1, 7], - 'bar': [10, 20, 30, 40, 50, 60, 70]}, - columns=['dt', 'foo', 'bar']) - - # ordered=True - df['dt'] = Categorical(df['dt'], ordered=True) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt', ordered=True) - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt', ordered=True) - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - # ordered = False - df['dt'] = Categorical(df['dt'], ordered=False) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt') - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt') - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - def test_groupby_categorical_two_columns(self): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 
3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical([Interval(1, 2), Interval(2, 3), - Interval(3, 6)], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) - - def test_empty_sum(self): - # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - - # 0 by default - result = df.groupby("A").B.sum() - expected = pd.Series([3, 1, 0], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=0 - result = df.groupby("A").B.sum(min_count=0) - expected = pd.Series([3, 1, 0], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=1 - result = df.groupby("A").B.sum(min_count=1) - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count>1 - result = df.groupby("A").B.sum(min_count=2) - expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - def test_empty_prod(self): - # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - - # 1 by default - result = df.groupby("A").B.prod() - expected = pd.Series([2, 1, 1], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=0 - result = df.groupby("A").B.prod(min_count=0) - expected = pd.Series([2, 1, 1], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=1 - result = df.groupby("A").B.prod(min_count=1) - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + +def test_groupby_categorical_no_compress(): + data = Series(np.random.randn(9)) + + codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + 
assert_series_equal(result, exp)
+
+    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
+    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
+
+    result = data.groupby(cats).mean()
+    exp = data.groupby(codes).mean().reindex(cats.categories)
+    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+                                 ordered=cats.ordered)
+    assert_series_equal(result, exp)
+
+    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+                       categories=["a", "b", "c", "d"], ordered=True)
+    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+    result = data.groupby("b").mean()
+    result = result["a"].values
+    exp = np.array([1, 2, 4, np.nan])
+    tm.assert_numpy_array_equal(result, exp)
+
+
+def test_groupby_sort_categorical():
+    # GH 8868: dataframe groupby sort was being ignored
+    df = DataFrame([['(7.5, 10]', 10, 10],
+                    ['(7.5, 10]', 8, 20],
+                    ['(2.5, 5]', 5, 30],
+                    ['(5, 7.5]', 6, 40],
+                    ['(2.5, 5]', 4, 50],
+                    ['(0, 2.5]', 1, 60],
+                    ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
+    df['range'] = Categorical(df['range'], ordered=True)
+    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                              '(7.5, 10]'], name='range', ordered=True)
+    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                            columns=['foo', 'bar'], index=index)
+
+    col = 'range'
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    # when the categorical is ordered, groups are sorted by category order
+    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+    df['range'] = Categorical(df['range'], ordered=False)
+    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                              '(7.5, 10]'], name='range')
+    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                            columns=['foo', 'bar'], index=index)
+
+    index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+                              '(0, 2.5]'],
+                             categories=['(7.5, 10]', '(2.5, 5]',
+                                         '(5, 7.5]', '(0, 2.5]'],
+                             name='range')
+    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                              index=index, columns=['foo', 'bar'])
+
+    col = 'range'
+    # this is an unordered categorical, but we allow it
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
+
+def test_groupby_sort_categorical_datetimelike():
+    # GH10505
+
+    # use the same data as test_groupby_sort_categorical, whose categories
+    # correspond to datetime.month
+    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+                           datetime(2011, 2, 1), datetime(2011, 5, 1),
+                           datetime(2011, 2, 1), datetime(2011, 1, 1),
+                           datetime(2011, 5, 1)],
+                    'foo': [10, 8, 5, 6, 4, 1, 7],
+                    'bar': [10, 20, 30, 40, 50, 60, 70]},
+                   columns=['dt', 'foo', 'bar'])
+
+    # ordered=True
+    df['dt'] = Categorical(df['dt'], ordered=True)
+    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+             datetime(2011, 5, 1), datetime(2011, 7, 1)]
+    result_sort = DataFrame(
+        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+             datetime(2011, 5, 1), datetime(2011, 1, 1)]
+    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                              columns=['foo', 'bar'])
+    result_nosort.index = CategoricalIndex(index, categories=index,
+                                           name='dt', ordered=True)
+
+    col = 'dt'
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    # when the categorical is ordered, groups are sorted by category order
+    assert_frame_equal(result_sort, df.groupby(col, 
sort=False).first()) + + # ordered = False + df['dt'] = Categorical(df['dt'], ordered=False) + index = [datetime(2011, 1, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 7, 1)] + result_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) + result_sort.index = CategoricalIndex(index, name='dt') + + index = [datetime(2011, 7, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 1, 1)] + result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + columns=['foo', 'bar']) + result_nosort.index = CategoricalIndex(index, categories=index, + name='dt') + + col = 'dt' + assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + + +def test_groupby_categorical_two_columns(): + + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) + exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, + index=exp_index) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat", "ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), + "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" + ]) + tm.assert_frame_equal(res, exp) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values, 'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product( + [Categorical([Interval(1, 2), Interval(2, 3), + Interval(3, 6)], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, + nan, nan, nan, nan, 4, 5], + "C3": [nan, nan, nan, nan, 10, 100, + nan, nan, nan, nan, 200, 34]}, index=idx) + tm.assert_frame_equal(res, exp) + + +def test_empty_sum(): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 0 by default + result = df.groupby("A").B.sum() + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.sum(min_count=0) + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.sum(min_count=1) + expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count>1 + result = df.groupby("A").B.sum(min_count=2) + expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + +def test_empty_prod(): + # 
https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 1 by default + result = df.groupby("A").B.prod() + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.prod(min_count=0) + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.prod(min_count=1) + expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index cac6b46af8f87..873d9f6076b69 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -1,622 +1,576 @@ # -*- coding: utf-8 -*- from __future__ import print_function -from numpy import nan - import pytest -from pandas import Timestamp -from pandas.core.index import MultiIndex -from pandas.core.api import DataFrame - -from pandas.core.series import Series - -from pandas.util.testing import (assert_frame_equal, assert_series_equal - ) -from pandas.compat import (lmap) - -from pandas import compat - -import pandas.core.common as com import numpy as np - import pandas.util.testing as tm +from pandas import Timestamp, DataFrame, Series import pandas as pd -class TestGroupByFilter(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_filter_series(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. 
- assert_series_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) - - def test_filter_single_column_df(self): - df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) - grouper = df[0].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) - - def test_filter_multi_column_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) - - def test_filter_mixed_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) - - def test_filter_out_all_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) - - def test_filter_out_no_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - filtered = grouped.filter(lambda x: x.mean() > 0) - assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) - assert_frame_equal(filtered, df) - - def test_filter_out_all_groups_in_df(self): - # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) - assert_frame_equal(expected, res) - - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") - assert_frame_equal(expected, res) - - def test_filter_condition_raises(self): - def raise_if_sum_is_zero(x): - if x.sum() == 0: - raise ValueError - else: - return x.sum() > 0 - - s = pd.Series([-1, 0, 1, 2]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - pytest.raises(TypeError, - lambda: grouped.filter(raise_if_sum_is_zero)) - - def test_filter_with_axis_in_groupby(self): - # issue 11041 - index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), 
columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) - expected = data.iloc[:, 12:20] - assert_frame_equal(result, expected) - - def test_filter_bad_shapes(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') - g_s = s.groupby(s) - - f = lambda x: x - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: x == 1 - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: np.outer(x, x) - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - def test_filter_nan_is_false(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) - g_s = s.groupby(s) - - f = lambda x: np.nan - assert_frame_equal(g_df.filter(f), df.loc[[]]) - assert_series_equal(g_s.filter(f), s[[]]) - - def test_filter_against_workaround(self): - np.random.seed(0) - # Series of ints - s = Series(np.random.randint(0, 100, 1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Series of floats - s = 100 * Series(np.random.random(1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Set up DataFrame of ints, floats, strings. - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 1000 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - - # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - # Group by floats (rounded); filter on strings. - grouper = df.floats.apply(lambda x: np.round(x, -1)) - grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) - assert_frame_equal(new_way, old_way) - - # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - def test_filter_using_len(self): - # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') - actual = grouped.filter(lambda x: len(x) > 2) - expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) - assert_frame_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = df.loc[[]] - assert_frame_equal(actual, expected) - - # Series have always worked properly, but we'll test anyway. 
- s = df['B'] - grouped = s.groupby(s) - actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') - assert_series_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = s[[]] - assert_series_equal(actual, expected) - - def test_filter_maintains_ordering(self): - # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - def test_filter_multiple_timestamp(self): - # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) - - grouped = df.groupby(['B', 'C']) - - result = grouped['A'].filter(lambda x: True) - assert_series_equal(df['A'], result) - - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') - assert_series_equal(result, expected) - - result = grouped.filter(lambda x: True) - assert_frame_equal(df, result) - - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) - assert_frame_equal(result, expected) - - result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) - assert_frame_equal(result, expected) - - def test_filter_and_transform_with_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_multiple_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_float_index(self): - # GH4620 - index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_timestamp_index(self): - # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') - index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_string_index(self): - # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_has_access_to_grouped_cols(self): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # previously didn't have access to col A #???? 
- filt = g.filter(lambda x: x['A'].sum() == 2) - assert_frame_equal(filt, df.iloc[[0, 1]]) - - def test_filter_enforces_scalarness(self): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with tm.assert_raises_regex(TypeError, - 'filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') - - def test_filter_non_bool_raises(self): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with tm.assert_raises_regex(TypeError, - 'filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) - - def test_filter_dropna_with_empty_groups(self): - # GH 10780 - data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) - groupped = data.groupby(level=0) - result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) - expected_false = pd.Series([np.nan] * 9, - index=np.repeat([1, 2, 3], 3)) - tm.assert_series_equal(result_false, expected_false) - - result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) - expected_true = pd.Series(index=pd.Index([], dtype=int)) - tm.assert_series_equal(result_true, expected_true) - - -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() - - -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) +def test_filter_series(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + +def test_filter_single_column_df(): + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. 
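+    # (with dropna=False, rows in groups that fail the predicate are kept
+    # as NaN instead of being dropped, so the result stays aligned with
+    # the original index -- hence the reindex(df.index) below)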
+ tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + +def test_filter_multi_column_df(): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), + expected) + + +def test_filter_mixed_df(): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + +def test_filter_out_all_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) + + +def test_filter_out_no_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + tm.assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + tm.assert_frame_equal(filtered, df) + + +def test_filter_out_all_groups_in_df(): + # GH12768 + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) + expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3}) + tm.assert_frame_equal(expected, res) + + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) + expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + tm.assert_frame_equal(expected, res) + + +def test_filter_condition_raises(): + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + + s = pd.Series([-1, 0, 1, 2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + pytest.raises(TypeError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + +def test_filter_with_axis_in_groupby(): + # issue 11041 + index = pd.MultiIndex.from_product([range(10), [0, 1]]) + data = pd.DataFrame( + np.arange(100).reshape(-1, 20), columns=index, dtype='int64') + result = data.groupby(level=0, + axis=1).filter(lambda x: x.iloc[0, 0] > 10) + expected = data.iloc[:, 12:20] + tm.assert_frame_equal(result, expected) + + +def test_filter_bad_shapes(): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby('B') + g_s = s.groupby(s) + + f = lambda x: x + pytest.raises(TypeError, lambda: g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: x == 1 + pytest.raises(TypeError, lambda: g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: np.outer(x, x) + pytest.raises(TypeError, lambda: 
g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + +def test_filter_nan_is_false(): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby(df['B']) + g_s = s.groupby(s) + + f = lambda x: np.nan + tm.assert_frame_equal(g_df.filter(f), df.loc[[]]) + tm.assert_series_equal(g_s.filter(f), s[[]]) + + +def test_filter_against_workaround(): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0, 100, 1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Series of floats + s = 100 * Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters. + transform(lambda x: len(x) < N / 10).astype('bool')] + new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + tm.assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + +def test_filter_using_len(): + # BUG GH4447 + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + grouped = df.groupby('B') + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame( + {'A': np.arange(2, 6), + 'B': list('bbbb'), + 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + tm.assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.loc[[]] + tm.assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. + s = df['B'] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + tm.assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + tm.assert_series_equal(actual, expected) + + +def test_filter_maintains_ordering(): + # Simple case: index is sequential. 
#4621 + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + +def test_filter_multiple_timestamp(): + # GH 10114 + df = DataFrame({'A': np.arange(5, dtype='int64'), + 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], + 'C': Timestamp('20130101')}) + + grouped = df.groupby(['B', 'C']) + + result = grouped['A'].filter(lambda x: True) + tm.assert_series_equal(df['A'], result) + + result = grouped['A'].transform(len) + expected = Series([2, 3, 2, 3, 3], name='A') + tm.assert_series_equal(result, expected) + + result = grouped.filter(lambda x: True) + tm.assert_frame_equal(df, result) + + result = grouped.transform('sum') + expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + tm.assert_frame_equal(result, expected) + + result = grouped.transform(len) + expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + tm.assert_frame_equal(result, expected) + + +def test_filter_and_transform_with_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
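+    # (the singleton groups -- tags 23, 24, 34, 25 -- are NaN-ed out at
+    # positions 0, 3, 5 and 6, which also upcasts 'pid' from int to float)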
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_multiple_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_float_index(): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_timestamp_index(): + # GH4620 + t0 = Timestamp('2013-09-30 00:05:00') + t1 = Timestamp('2013-10-30 00:05:00') + t2 = Timestamp('2013-11-30 00:05:00') + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_string_index(): + # GH4620 + index = list('bbbcbbab') + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_has_access_to_grouped_cols(): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? 
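+    # the predicate may reference the grouping column itself: only the
+    # A == 1 group (rows 0 and 1) sums to 2, so it alone survives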
+ filt = g.filter(lambda x: x['A'].sum() == 2) + tm.assert_frame_equal(filt, df.iloc[[0, 1]]) + + +def test_filter_enforces_scalarness(): + df = pd.DataFrame([ + ['best', 'a', 'x'], + ['worst', 'b', 'y'], + ['best', 'c', 'x'], + ['best', 'd', 'y'], + ['worst', 'd', 'y'], + ['worst', 'd', 'y'], + ['best', 'd', 'z'], + ], columns=['a', 'b', 'c']) + with tm.assert_raises_regex(TypeError, + 'filter function returned a.*'): + df.groupby('c').filter(lambda g: g['a'] == 'best') + + +def test_filter_non_bool_raises(): + df = pd.DataFrame([ + ['best', 'a', 1], + ['worst', 'b', 1], + ['best', 'c', 1], + ['best', 'd', 1], + ['worst', 'd', 1], + ['worst', 'd', 1], + ['best', 'd', 1], + ], columns=['a', 'b', 'c']) + with tm.assert_raises_regex(TypeError, + 'filter function returned a.*'): + df.groupby('a').filter(lambda g: g.c.mean()) + + +def test_filter_dropna_with_empty_groups(): + # GH 10780 + data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) + groupped = data.groupby(level=0) + result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) + expected_false = pd.Series([np.nan] * 9, + index=np.repeat([1, 2, 3], 3)) + tm.assert_series_equal(result_false, expected_false) + + result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) + expected_true = pd.Series(index=pd.Index([], dtype=int)) + tm.assert_series_equal(result_true, expected_true) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py new file mode 100644 index 0000000000000..ba1371fe9f931 --- /dev/null +++ b/pandas/tests/groupby/test_function.py @@ -0,0 +1,1120 @@ +import pytest + +import numpy as np +import pandas as pd +from pandas import (DataFrame, Index, compat, isna, + Series, MultiIndex, Timestamp, date_range) +from pandas.errors import UnsupportedFunctionCall +from pandas.util import testing as tm +import pandas.core.nanops as nanops +from string import ascii_lowercase +from pandas.compat import product as cart_product + + +@pytest.mark.parametrize("agg_func", ['any', 'all']) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("vals", [ + ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], + [1, 2, 3], [1, 0, 0], [0, 0, 0], + [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], + [True, True, True], [True, False, False], [False, False, False], + [np.nan, np.nan, np.nan] +]) +def test_groupby_bool_aggs(agg_func, skipna, vals): + df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(compat.builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == 'any': + exp = False + + exp_df = DataFrame([exp] * 2, columns=['val'], index=Index( + ['a', 'b'], name='key')) + result = getattr(df.groupby('key'), agg_func)(skipna=skipna) + tm.assert_frame_equal(result, exp_df) + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({'nn': [11, 11, 22, 22], + 'ii': [1, 2, 3, 4], + 'ss': 4 * ['mama']}) + + result = aa.groupby('nn').max() + assert 'ss' in result + + result = aa.groupby('nn').max(numeric_only=False) + assert 'ss' in result + + result = aa.groupby('nn').min() + assert 'ss' in result + + result = aa.groupby('nn').min(numeric_only=False) + assert 'ss' in result + + +def test_intercept_builtin_sum(): + s = Series([1., 2., np.nan, 3.]) + grouped = s.groupby([0, 1, 2, 2]) + + result = grouped.agg(compat.builtins.sum) + result2 = grouped.apply(compat.builtins.sum) + expected = grouped.sum() + 
tm.assert_series_equal(result, expected)
+    tm.assert_series_equal(result2, expected)
+
+
+def test_builtins_apply():  # GH8155
+    df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
+                      columns=['jim', 'joe'])
+    df['jolie'] = np.random.randn(1000)
+
+    for keys in ['jim', ['jim', 'joe']]:  # single key & multi-key
+        for f in [max, min, sum]:
+            fname = f.__name__
+            result = df.groupby(keys).apply(f)
+            ngroups = len(df.drop_duplicates(subset=keys))
+            assert result.shape == (ngroups, 3), 'invalid frame shape: '\
+                '{} (expected ({}, 3))'.format(result.shape, ngroups)
+
+            tm.assert_frame_equal(result,  # numpy's equivalent function
+                                  df.groupby(keys).apply(getattr(np, fname)))
+
+            if f != sum:
+                expected = df.groupby(keys).agg(fname).reset_index()
+                expected.set_index(keys, inplace=True, drop=False)
+                tm.assert_frame_equal(result, expected, check_dtype=False)
+
+            tm.assert_series_equal(getattr(result, fname)(),
+                                   getattr(df, fname)())
+
+
+def test_arg_passthru():
+    # make sure that we are passing thru kwargs
+    # to our agg functions
+
+    # GH3668
+    # GH5724
+    df = pd.DataFrame(
+        {'group': [1, 1, 2],
+         'int': [1, 2, 3],
+         'float': [4., 5., 6.],
+         'string': list('abc'),
+         'category_string': pd.Series(list('abc')).astype('category'),
+         'category_int': [7, 8, 9],
+         'datetime': pd.date_range('20130101', periods=3),
+         'datetimetz': pd.date_range('20130101',
+                                     periods=3,
+                                     tz='US/Eastern'),
+         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
+        columns=['group', 'int', 'float', 'string',
+                 'category_string', 'category_int',
+                 'datetime', 'datetimetz',
+                 'timedelta'])
+
+    expected_columns_numeric = Index(['int', 'float', 'category_int'])
+
+    # mean / median
+    expected = pd.DataFrame(
+        {'category_int': [7.5, 9],
+         'float': [4.5, 6.],
+         'timedelta': [pd.Timedelta('1.5s'),
+                       pd.Timedelta('3s')],
+         'int': [1.5, 3],
+         'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
+                      pd.Timestamp('2013-01-03 00:00:00')],
+         'datetimetz': [
+             pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
+             pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
+        index=Index([1, 2], name='group'),
+        columns=['int', 'float', 'category_int',
+                 'datetime', 'datetimetz', 'timedelta'])
+    for attr in ['mean', 'median']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+    # TODO: min, max *should* handle
+    # categorical (ordered) dtype
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_int',
+                              'datetime', 'datetimetz',
+                              'timedelta'])
+    for attr in ['min', 'max']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_string', 'category_int',
+                              'datetime', 'datetimetz',
+                              'timedelta'])
+    for attr in ['first', 'last']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_int', 'timedelta'])
+    for attr in ['sum']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
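+        # with numeric_only=False, sum also aggregates the string column
+        # (concatenation) and timedeltas, but still excludes datetimes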
+ tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int']) + for attr in ['prod', 'cumprod']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + # like min, max, but don't include strings + expected_columns = Index(['int', 'float', + 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + for attr in ['cummin', 'cummax']: + f = getattr(df.groupby('group'), attr) + result = f() + # GH 15561: numeric_only=False set by default like min/max + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(['int', 'float', 'category_int', + 'timedelta']) + for attr in ['cumsum']: + f = getattr(df.groupby('group'), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + +def test_non_cython_api(): + + # GH5610 + # non-cython calls should not include the grouper + + df = DataFrame( + [[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, 'baz']], + columns=['A', 'B', 'C']) + g = df.groupby('A') + gni = df.groupby('A', as_index=False) + + # mad + expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) + expected.index.name = 'A' + result = g.mad() + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], + index=[0, 1]) + result = gni.mad() + tm.assert_frame_equal(result, expected) + + # describe + expected_index = pd.Index([1, 3], name='A') + expected_col = pd.MultiIndex(levels=[['B'], + ['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']], + labels=[[0] * 8, list(range(8))]) + expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan]], + index=expected_index, + columns=expected_col) + result = g.describe() + tm.assert_frame_equal(result, expected) + + expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T]) + expected.index = pd.Index([0, 1]) + result = gni.describe() + tm.assert_frame_equal(result, expected) + + # any + expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], + index=[1, 3]) + expected.index.name = 'A' + result = g.any() + tm.assert_frame_equal(result, expected) + + # idxmax + expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) + expected.index.name = 'A' + result = g.idxmax() + tm.assert_frame_equal(result, expected) + + +def test_cython_api2(): + + # this takes the fast apply path + + # cumsum (GH5614) + df = DataFrame( + [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] + ], columns=['A', 'B', 'C']) + expected = DataFrame( + [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) + result = df.groupby('A').cumsum() + tm.assert_frame_equal(result, expected) + + # GH 5755 - cumsum is a transformer and should ignore as_index + result = df.groupby('A', as_index=False).cumsum() + tm.assert_frame_equal(result, expected) + + # GH 13994 + result = df.groupby('A').cumsum(axis=1) + expected = df.cumsum(axis=1) + tm.assert_frame_equal(result, expected) + result = df.groupby('A').cumprod(axis=1) + expected = df.cumprod(axis=1) + tm.assert_frame_equal(result, expected) + + +def 
test_cython_median(): + df = DataFrame(np.random.randn(1000)) + df.values[::2] = np.nan + + labels = np.random.randint(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + exp = df.groupby(labels).agg(nanops.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.randn(1000, 5)) + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(): + df = pd.DataFrame(np.random.randint(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins).median() + expected = df.groupby(bins).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [ + 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) +@pytest.mark.parametrize("method,data", [ + ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}), + ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}) +]) +def test_groupby_non_arithmetic_agg_types(dtype, method, data): + # GH9311, GH6620 + df = pd.DataFrame( + [{'a': 1, 'b': 1}, + {'a': 1, 'b': 2}, + {'a': 2, 'b': 3}, + {'a': 2, 'b': 4}]) + + df['b'] = df.b.astype(dtype) + + if 'args' not in data: + data['args'] = [] + + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype + + exp = data['df'] + df_out = pd.DataFrame(exp) + + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) + + grpd = df.groupby('a') + t = getattr(grpd, method)(*data['args']) + tm.assert_frame_equal(t, df_out) + + +def test_groupby_non_arithmetic_agg_intlike_precision(): + # GH9311, GH6620 + c = 24650000000000000 + + inputs = ((Timestamp('2011-01-15 12:50:28.502376'), + Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c)) + + for i in inputs: + df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}]) + + grp_exp = {'first': {'expected': i[0]}, + 'last': {'expected': i[1]}, + 'min': {'expected': i[0]}, + 'max': {'expected': i[1]}, + 'nth': {'expected': i[1], + 'args': [1]}, + 'count': {'expected': 2}} + + for method, data in compat.iteritems(grp_exp): + if 'args' not in data: + data['args'] = [] + + grpd = df.groupby('a') + res = getattr(grpd, method)(*data['args']) + assert res.iloc[0].b == data['expected'] + + +def test_fill_constistency(): + + # GH9221 + # pass thru keyword arguments to the generated wrapper + # are set if the passed kw is None (only) + df = DataFrame(index=pd.MultiIndex.from_product( + [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), + columns=Index( + ['1', '2'], name='id')) + df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, + np.nan, 22, np.nan] + df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, + np.nan, 44, np.nan] + + expected = df.groupby(level=0, axis=0).fillna(method='ffill') + result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T + tm.assert_frame_equal(result, expected) + + +def test_groupby_cumprod(): + # GH 4095 + df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) + + actual = df.groupby('key')['value'].cumprod() + expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) + expected.name = 'value' + tm.assert_series_equal(actual, expected) + + df = 
pd.DataFrame({'key': ['b'] * 100, 'value': 2}) + actual = df.groupby('key')['value'].cumprod() + # if overflows, groupby product casts to float + # while numpy passes back invalid values + df['value'] = df['value'].astype(float) + expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) + expected.name = 'value' + tm.assert_series_equal(actual, expected) + + +def test_ops_general(): + ops = [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), ] + try: + from scipy.stats import sem + except ImportError: + pass + else: + ops.append(('sem', sem)) + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + df = pd.read_csv(compat.StringIO(raw), parse_dates=[0]) + gb = df.groupby('Date') + r = gb[['File']].max() + e = gb['File'].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r['File'].isna().any() + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series([ + 7, 5, 3, 10, 9, 6 + ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([ + 3, 2, 1, 3, 3, 2 + ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) + tm.assert_series_equal(gb.nlargest(3, keep='last'), e) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series([ + 1, 2, 3, 0, 4, 6 + ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([ + 0, 1, 1, 0, 1, 2 + ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) + tm.assert_series_equal(gb.nsmallest(3, keep='last'), e) + + +def test_numpy_compat(): + # see gh-12811 + df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) + g = df.groupby('A') + + msg = "numpy operations are not valid with groupby" + + for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'): + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(g, func), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(g, func), foo=1) + + +def test_cummin_cummax(): + # GH 15048 + num_types = [np.int32, np.int64, np.float32, np.float64] + num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, + np.finfo(np.float32).min, np.finfo(np.float64).min] + num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, + np.finfo(np.float32).max, np.finfo(np.float64).max] + base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], + 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + for dtype, min_val, max_val in zip(num_types, num_mins, num_max): + df = 
base_df.astype(dtype) + + # cummin + expected = pd.DataFrame({'B': expected_mins}).astype(dtype) + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummin w/ min value for dtype + df.loc[[2, 6], 'B'] = min_val + expected.loc[[2, 3, 6, 7], 'B'] = min_val + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # cummax + expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummax w/ max value for dtype + df.loc[[2, 6], 'B'] = max_val + expected.loc[[2, 3, 6, 7], 'B'] = max_val + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in some values + base_df.loc[[0, 2, 4, 6], 'B'] = np.nan + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, + np.nan, 3, np.nan, 1]}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummin()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, + np.nan, 3, np.nan, 3]}) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummax()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + # Test nan in entire column + base_df['B'] = np.nan + expected = pd.DataFrame({'B': [np.nan] * 8}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(expected, result) + + # GH 15561 + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) + expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') + for method in ['cummax', 'cummin']: + result = getattr(df.groupby('a')['b'], method)() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) + result = df.groupby('a').b.cummax() + expected = pd.Series([2, 1, 2], name='b') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) + result = df.groupby('a').b.cummin() + expected = pd.Series([1, 2, 1], name='b') + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('in_vals, out_vals', [ + + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), +]) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + 
source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_increasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), +]) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_decreasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + + +# describe +# -------------------------------- + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level='first') + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result['mean'], grouped.mean(), + check_names=False) + tm.assert_series_equal(result['std'], grouped.std(), check_names=False) + tm.assert_series_equal(result['min'], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack() + tm.assert_series_equal(result, expected) + + +def test_series_index_name(df): + grouped = df.loc[:, ['C']].groupby(df['A']) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == 'A' + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + labels=[[0] * len(group.columns), range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + groupedT = tsframe.groupby({'A': 0, 'B': 0, + 'C': 1, 'D': 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + labels=[[0, 0, 1, 1], range(len(expected.index))]) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame({'x': [1, 2, 3, 4, 
5] * 3, + 'y': [10, 20, 30, 40, 50] * 3, + 'z': [100, 200, 300, 400, 500] * 3}) + df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={'k': 'key'}) + pytest.raises(ValueError, lambda: df1.groupby('k').describe()) + pytest.raises(ValueError, lambda: df2.groupby('key').describe()) + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + +# nunique +# -------------------------------- + +@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6), + (10, 100, 1000))) +@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) +def test_series_groupby_nunique(n, m, sort, dropna): + + def check_nunique(df, keys, as_index=True): + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr['julie'].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + tm.assert_series_equal(left, right, check_names=False) + + days = date_range('2015-08-23', periods=10) + + frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n), + 'joe': np.random.choice(days, n), + 'julie': np.random.randint(0, m, n)}) + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + + frame.loc[1::17, 'jim'] = None + frame.loc[3::37, 'joe'] = None + frame.loc[7::19, 'julie'] = None + frame.loc[8::19, 'julie'] = None + frame.loc[9::19, 'julie'] = None + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) + + +def test_nunique(): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = pd.DataFrame( + [[100, 1, 'Alice'], + [200, 2, 'Bob'], + [300, 3, 'Charlie'], + [-400, 4, 'Dan'], + [500, 5, 'Edith']], + columns=['amount', 'id', 'name'] + ) + + result = data.groupby(['id', 'amount'])['name'].nunique() + index = MultiIndex.from_arrays([data.id, 
data.amount]) + expected = pd.Series([1] * 5, name='name', index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = pd.Series(name='name') + result = data.groupby(level=0).nunique() + expected = pd.Series(name='name', dtype='int64') + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + Timestamp('2016-06-28 16:09:30'), + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}).set_index('time') + result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() + expected = test.groupby( + pd.Grouper(freq='h') + )['data'].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +# count +# -------------------------------- + +def test_groupby_timedelta_cython_count(): + df = DataFrame({'g': list('ab' * 2), + 'delt': np.arange(4).astype('timedelta64[ns]')}) + expected = Series([ + 2, 2 + ], index=pd.Index(['a', 'b'], name='g'), name='delt') + result = df.groupby('g').delt.count() + tm.assert_series_equal(expected, result) + + +def test_count(): + n = 1 << 15 + dr = date_range('2015-08-30', periods=n // 10, freq='T') + + df = DataFrame({ + '1st': np.random.choice( + list(ascii_lowercase), n), + '2nd': np.random.randint(0, 5, n), + '3rd': np.random.randn(n).round(3), + '4th': np.random.randint(-10, 10, n), + '5th': np.random.choice(dr, n), + '6th': np.random.randn(n).round(3), + '7th': np.random.randn(n).round(3), + '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), + '9th': np.random.choice( + list(ascii_lowercase), n) + }) + + for col in df.columns.drop(['1st', '2nd', '4th']): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df['9th'] = df['9th'].astype('category') + + for key in '1st', '2nd', ['1st', '2nd']: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + tm.assert_frame_equal(left, right) + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, np.nan]], + columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], + index=[1, 3]) + expected.index.name = 'A' + tm.assert_frame_equal(count_not_as, expected.reset_index()) + tm.assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + tm.assert_series_equal(count_B, expected['B']) + + +def test_count_object(): + df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 3, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 1, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + +def test_count_cross_type(): + # GH8169 + vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( + 0, 2, (100, 2)))) + + df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df[df == 2] = np.nan + expected = df.groupby(['c', 'd']).count() + + for t in ['float32', 'object']: + df['a'] = df['a'].astype(t) + df['b'] = df['b'].astype(t) + result = df.groupby(['c', 'd']).count() + tm.assert_frame_equal(result, expected) + + +def test_lower_int_prec_count(): + df = DataFrame({'a': 
np.array( + [0, 1, 2, 100], np.int8), + 'b': np.array( + [1, 2, 3, 6], np.uint32), + 'c': np.array( + [4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + +def test_count_uses_size_on_exception(): + class RaisingObjectException(Exception): + pass + + class RaisingObject(object): + + def __init__(self, msg='I will raise inside Cython'): + super(RaisingObject, self).__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({'a': [RaisingObject() for _ in range(4)], + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2]}, index=pd.Index( + list('ab'), name='grp')) + tm.assert_frame_equal(result, expected) + + +# size +# -------------------------------- + +def test_size(df): + grouped = df.groupby(['A', 'B']) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby('A') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby('B') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) + for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + # GH11699 + df = DataFrame([], columns=['A', 'B']) + out = Series([], dtype='int64', index=Index([], name='A')) + tm.assert_series_equal(df.groupby('A').size(), out) + + +# pipe +# -------------------------------- + +def test_pipe(): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': random_state.randn(8), + 'C': random_state.randn(8)}) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby('A').pipe(f).pipe(square) + + index = Index([u'bar', u'foo'], dtype='object', name=u'A') + expected = pd.Series([8.99110003361, 8.17516964785], name='B', + index=index) + + tm.assert_series_equal(expected, result) + + +def test_pipe_args(): + # Test passing args to the pipe method of DataFrameGroupBy. 
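+    # gb.pipe(f, *args, **kwargs) is equivalent to f(gb, *args, **kwargs),
+    # so each scalar below is threaded through as the second positional
+    # argument of f, g and h in turn.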
+ # Issue #17871 + + df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], + 'x': [1.0, 2.0, 3.0, 2.0, 5.0], + 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + + def f(dfgb, arg1): + return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) + .groupby(dfgb.grouper)) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = (df + .groupby('group') + .pipe(f, 0) + .pipe(g, 10) + .pipe(h, 100)) + + # Assert the results here + index = pd.Index(['A', 'B', 'C'], name='group') + expected = pd.Series([-79.5160891089, -78.4839108911, -80], + index=index) + + tm.assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py deleted file mode 100644 index b9718663570bd..0000000000000 --- a/pandas/tests/groupby/test_functional.py +++ /dev/null @@ -1,372 +0,0 @@ -# -*- coding: utf-8 -*- - -""" test function application """ - -import pytest - -from string import ascii_lowercase -from pandas import (date_range, Timestamp, - Index, MultiIndex, DataFrame, Series) -from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.compat import product as cart_product - -import numpy as np - -import pandas.util.testing as tm -import pandas as pd -from .common import MixIn - - -# describe -# -------------------------------- - -class TestDescribe(MixIn): - - def test_apply_describe_bug(self): - grouped = self.mframe.groupby(level='first') - grouped.describe() # it works! 
- - def test_series_describe_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - assert_series_equal(result['mean'], grouped.mean(), check_names=False) - assert_series_equal(result['std'], grouped.std(), check_names=False) - assert_series_equal(result['min'], grouped.min(), check_names=False) - - def test_series_describe_single(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() - assert_series_equal(result, expected) - - def test_series_index_name(self): - grouped = self.df.loc[:, ['C']].groupby(self.df['A']) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' - - def test_frame_describe_multikey(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in self.tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = pd.MultiIndex( - levels=[[col], group.columns], - labels=[[0] * len(group.columns), range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - groupedT = self.tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) - result = groupedT.describe() - expected = self.tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - labels=[[0, 0, 1, 1], range(len(expected.index))]) - tm.assert_frame_equal(result, expected) - - def test_frame_describe_tupleindex(self): - - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) - pytest.raises(ValueError, lambda: df1.groupby('k').describe()) - pytest.raises(ValueError, lambda: df2.groupby('key').describe()) - - def test_frame_describe_unstacked_format(self): - # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df = pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) - tm.assert_frame_equal(result, expected) - - -# nunique -# -------------------------------- - -class TestNUnique(MixIn): - - def test_series_groupby_nunique(self): - - def check_nunique(df, keys, as_index=True): - for sort, dropna in cart_product((False, True), repeat=2): - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - assert_series_equal(left, right, check_names=False) 
- - days = date_range('2015-08-23', periods=10) - - for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): - frame = DataFrame({ - 'jim': np.random.choice( - list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n) - }) - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) - - def test_nunique(self): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - def test_nunique_with_object(self): - # GH 11077 - data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] - ) - - result = data.groupby(['id', 'amount'])['name'].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) - tm.assert_series_equal(result, expected) - - def test_nunique_with_empty_series(self): - # GH 12553 - data = pd.Series(name='name') - result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') - tm.assert_series_equal(result, expected) - - def test_nunique_with_timegrouper(self): - # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) - tm.assert_series_equal(result, expected) - - -# count -# -------------------------------- - -class TestCount(MixIn): - - def test_groupby_timedelta_cython_count(self): - df = DataFrame({'g': list('ab' * 2), - 'delt': np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() - tm.assert_series_equal(expected, result) - - def test_count(self): - n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', 
'4th']): - df.loc[np.random.choice(n, n // 10), col] = np.nan - - df['9th'] = df['9th'].astype('category') - - for key in '1st', '2nd', ['1st', '2nd']: - left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) - assert_frame_equal(left, right) - - # GH5610 - # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, np.nan]], - columns=['A', 'B', 'C']) - - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() - - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - assert_frame_equal(count_not_as, expected.reset_index()) - assert_frame_equal(count_as, expected) - - count_B = df.groupby('A')['B'].count() - assert_series_equal(count_B, expected['B']) - - def test_count_object(self): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - def test_count_cross_type(self): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) - - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) - df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() - - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() - tm.assert_frame_equal(result, expected) - - def test_lower_int_prec_count(self): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) - tm.assert_frame_equal(result, expected) - - def test_count_uses_size_on_exception(self): - class RaisingObjectException(Exception): - pass - - class RaisingObject(object): - - def __init__(self, msg='I will raise inside Cython'): - super(RaisingObject, self).__init__() - self.msg = msg - - def __eq__(self, other): - # gets called in Cython to check that raising calls the method - raise RaisingObjectException(self.msg) - - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) - tm.assert_frame_equal(result, expected) - - -# size -# -------------------------------- - -class TestSize(MixIn): - - def test_size(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('A') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('B') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: 
a.shape[0]) - assert_series_equal(left, right, check_names=False) - - # GH11699 - df = DataFrame([], columns=['A', 'B']) - out = Series([], dtype='int64', index=Index([], name='A')) - assert_series_equal(df.groupby('A').size(), out) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c3400b6b710e5..bb892f92f213e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,3090 +5,1672 @@ from warnings import catch_warnings from datetime import datetime +from decimal import Decimal -from pandas import (date_range, bdate_range, Timestamp, +from pandas import (date_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex, read_csv) -from pandas.core.dtypes.missing import isna -from pandas.errors import UnsupportedFunctionCall, PerformanceWarning -from pandas.util.testing import (assert_frame_equal, assert_index_equal, + Panel, DatetimeIndex, read_csv) +from pandas.errors import PerformanceWarning +from pandas.util.testing import (assert_frame_equal, assert_series_equal, assert_almost_equal) from pandas.compat import (range, lrange, StringIO, lmap, lzip, map, zip, - builtins, OrderedDict) + OrderedDict) from pandas import compat from collections import defaultdict import pandas.core.common as com import numpy as np -import pandas.core.nanops as nanops import pandas.util.testing as tm import pandas as pd -from .common import MixIn -class TestGrouper(object): +def test_repr(): + # GH18203 + result = repr(pd.Grouper(key='A', level='B')) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected - def test_repr(self): - # GH18203 - result = repr(pd.Grouper(key='A', level='B')) - expected = "Grouper(key='A', level='B', axis=0, sort=False)" - assert result == expected +@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32']) +def test_basic(dtype): -class TestGroupBy(MixIn): + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) - def test_basic(self): - def checkit(dtype): - data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) + grouped = data.groupby(lambda x: x // 3) - grouped = data.groupby(lambda x: x // 3) + for k, v in grouped: + assert len(v) == 3 - for k, v in grouped: - assert len(v) == 3 + agged = grouped.aggregate(np.mean) + assert agged[1] == 1 - agged = grouped.aggregate(np.mean) - assert agged[1] == 1 + assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + assert_series_equal(agged, grouped.mean()) + assert_series_equal(grouped.agg(np.sum), grouped.sum()) - assert_series_equal(agged, grouped.agg(np.mean)) # shorthand - assert_series_equal(agged, grouped.mean()) - assert_series_equal(grouped.agg(np.sum), grouped.sum()) + expected = grouped.apply(lambda x: x * x.sum()) + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + assert_series_equal(transformed, expected) - expected = grouped.apply(lambda x: x * x.sum()) - transformed = grouped.transform(lambda x: x * x.sum()) - assert transformed[7] == 12 - assert_series_equal(transformed, expected) + value_grouped = data.groupby(data) + assert_series_equal(value_grouped.aggregate(np.mean), agged, + check_index_type=False) - value_grouped = data.groupby(data) - assert_series_equal(value_grouped.aggregate(np.mean), agged, - check_index_type=False) + # complex 
agg + agged = grouped.aggregate([np.mean, np.std]) - # complex agg - agged = grouped.aggregate([np.mean, np.std]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + agged = grouped.aggregate({'one': np.mean, 'two': np.std}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - agged = grouped.aggregate({'one': np.mean, 'two': np.std}) + group_constants = {0: 10, 1: 20, 2: 30} + agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) + assert agged[1] == 21 - group_constants = {0: 10, 1: 20, 2: 30} - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - assert agged[1] == 21 + # corner cases + pytest.raises(Exception, grouped.aggregate, lambda x: x * 2) - # corner cases - pytest.raises(Exception, grouped.aggregate, lambda x: x * 2) - for dtype in ['int64', 'int32', 'float64', 'float32']: - checkit(dtype) +def test_groupby_nonobject_dtype(mframe, df_mixed_floats): + key = mframe.index.labels[0] + grouped = mframe.groupby(key) + result = grouped.sum() - def test_groupby_nonobject_dtype(self): - key = self.mframe.index.labels[0] - grouped = self.mframe.groupby(key) - result = grouped.sum() + expected = mframe.groupby(key.astype('O')).sum() + assert_frame_equal(result, expected) - expected = self.mframe.groupby(key.astype('O')).sum() - assert_frame_equal(result, expected) + # GH 3911, mixed frame non-conversion + df = df_mixed_floats.copy() + df['value'] = lrange(len(df)) - # GH 3911, mixed frame non-conversion - df = self.df_mixed_floats.copy() - df['value'] = lrange(len(df)) + def max_value(group): + return group.loc[group['value'].idxmax()] - def max_value(group): - return group.loc[group['value'].idxmax()] + applied = df.groupby('A').apply(max_value) + result = applied.get_dtype_counts().sort_values() + expected = Series({'float64': 2, + 'int64': 1, + 'object': 2}).sort_values() + assert_series_equal(result, expected) - applied = df.groupby('A').apply(max_value) - result = applied.get_dtype_counts().sort_values() - expected = Series({'float64': 2, - 'int64': 1, - 'object': 2}).sort_values() - assert_series_equal(result, expected) - def test_groupby_return_type(self): +def test_groupby_return_type(): - # GH2893, return a reduced type - df1 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 2, "val2": 27}, - {"val1": 2, "val2": 12} - ]) + # GH2893, return a reduced type + df1 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12} + ]) - def func(dataf): - return dataf["val2"] - dataf["val2"].mean() + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() - result = df1.groupby("val1", squeeze=True).apply(func) - assert isinstance(result, Series) + result = df1.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) - df2 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 1, "val2": 27}, - {"val1": 1, "val2": 12} - ]) + df2 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12} + ]) - def func(dataf): - return dataf["val2"] - dataf["val2"].mean() + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df2.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) - result = df2.groupby("val1", squeeze=True).apply(func) - assert isinstance(result, Series) + # GH3596, return a consistent type (regression in 0.11 from 0.10.1) + df = DataFrame([[1, 1], [1, 1]], 
columns=['X', 'Y']) + result = df.groupby('X', squeeze=False).count() + assert isinstance(result, DataFrame) + + # GH5592 + # inconsistent return type + df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', + 'Pony', 'Pony'], B=Series( + np.arange(7), dtype='int64'), C=date_range( + '20130101', periods=7))) + + def f(grp): + return grp.iloc[0] + + expected = df.groupby('A').first()[['B']] + result = df.groupby('A').apply(f)[['B']] + assert_frame_equal(result, expected) + + def f(grp): + if grp.name == 'Tiger': + return None + return grp.iloc[0] + + result = df.groupby('A').apply(f)[['B']] + e = expected.copy() + e.loc['Tiger'] = np.nan + assert_frame_equal(result, e) + + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0] + + result = df.groupby('A').apply(f)[['B']] + e = expected.copy() + e.loc['Pony'] = np.nan + assert_frame_equal(result, e) + + # 5592 revisited, with datetimes + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0] + + result = df.groupby('A').apply(f)[['C']] + e = df.groupby('A').first()[['C']] + e.loc['Pony'] = pd.NaT + assert_frame_equal(result, e) + + # scalar outputs + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0].loc['C'] + + result = df.groupby('A').apply(f) + e = df.groupby('A').first()['C'].copy() + e.loc['Pony'] = np.nan + e.name = None + assert_series_equal(result, e) - - # GH3596, return a consistent type (regression in 0.11 from 0.10.1) - df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y']) - result = df.groupby('X', squeeze=False).count() - assert isinstance(result, DataFrame) - # GH5592 - # inconcistent return type - df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', - 'Pony', 'Pony'], B=Series( - np.arange(7), dtype='int64'), C=date_range( - '20130101', periods=7))) +def test_pass_args_kwargs(ts, tsframe): - def f(grp): - return grp.iloc[0] + def f(x, q=None, axis=0): + return np.percentile(x, q, axis=axis) - expected = df.groupby('A').first()[['B']] - result = df.groupby('A').apply(f)[['B']] - assert_frame_equal(result, expected) + g = lambda x: np.percentile(x, 80, axis=0) - def f(grp): - if grp.name == 'Tiger': - return None - return grp.iloc[0] + # Series + ts_grouped = ts.groupby(lambda x: x.month) + agg_result = ts_grouped.agg(np.percentile, 80, axis=0) + apply_result = ts_grouped.apply(np.percentile, 80, axis=0) + trans_result = ts_grouped.transform(np.percentile, 80, axis=0) + + agg_expected = ts_grouped.quantile(.8) + trans_expected = ts_grouped.transform(g) + + assert_series_equal(apply_result, agg_expected) + assert_series_equal(agg_result, agg_expected, check_names=False) + assert_series_equal(trans_result, trans_expected) + + agg_result = ts_grouped.agg(f, q=80) + apply_result = ts_grouped.apply(f, q=80) + trans_result = ts_grouped.transform(f, q=80) + assert_series_equal(agg_result, agg_expected) + assert_series_equal(apply_result, agg_expected) + assert_series_equal(trans_result, trans_expected) + + # DataFrame + df_grouped = tsframe.groupby(lambda x: x.month) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, .8) + expected = df_grouped.quantile(.8) + assert_frame_equal(apply_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=.8) + assert_frame_equal(agg_result, expected, check_names=False) + assert_frame_equal(apply_result, expected) + + +def test_len(): + df = 
tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]) + assert len(grouped) == len(df) - result = df.groupby('A').apply(f)[['B']] - e = expected.copy() - e.loc['Tiger'] = np.nan - assert_frame_equal(result, e) + grouped = df.groupby([lambda x: x.year, lambda x: x.month]) + expected = len({(x.year, x.month) for x in df.index}) + assert len(grouped) == expected - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0] + # issue 11016 + df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) + assert len(df.groupby(('a'))) == 0 + assert len(df.groupby(('b'))) == 3 + assert len(df.groupby(['a', 'b'])) == 3 + + +def test_basic_regression(): + # regression + T = [1.0 * x for x in lrange(1, 10) * 10][:1095] + result = Series(T, lrange(0, len(T))) - result = df.groupby('A').apply(f)[['B']] - e = expected.copy() - e.loc['Pony'] = np.nan - assert_frame_equal(result, e) + groupings = np.random.random((1100, )) + groupings = Series(groupings, lrange(0, len(groupings))) * 10. - # 5592 revisited, with datetimes - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0] + grouped = result.groupby(groupings) + grouped.mean() - result = df.groupby('A').apply(f)[['C']] - e = df.groupby('A').first()[['C']] - e.loc['Pony'] = pd.NaT - assert_frame_equal(result, e) - # scalar outputs - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0].loc['C'] - - result = df.groupby('A').apply(f) - e = df.groupby('A').first()['C'].copy() - e.loc['Pony'] = np.nan - e.name = None - assert_series_equal(result, e) - - def test_apply_issues(self): - # GH 5788 - - s = """2011.05.16,00:00,1.40893 -2011.05.16,01:00,1.40760 -2011.05.16,02:00,1.40750 -2011.05.16,03:00,1.40649 -2011.05.17,02:00,1.40893 -2011.05.17,03:00,1.40760 -2011.05.17,04:00,1.40750 -2011.05.17,05:00,1.40649 -2011.05.18,02:00,1.40893 -2011.05.18,03:00,1.40760 -2011.05.18,04:00,1.40750 -2011.05.18,05:00,1.40649""" - - df = pd.read_csv( - StringIO(s), header=None, names=['date', 'time', 'value'], - parse_dates=[['date', 'time']]) - df = df.set_index('date_time') - - expected = df.groupby(df.index.date).idxmax() - result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) - assert_frame_equal(result, expected) - - # GH 5789 - # don't auto coerce dates - df = pd.read_csv( - StringIO(s), header=None, names=['date', 'time', 'value']) - exp_idx = pd.Index( - ['2011.05.16', '2011.05.17', '2011.05.18' - ], dtype=object, name='date') - expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) - result = df.groupby('date').apply( - lambda x: x['time'][x['value'].idxmax()]) - assert_series_equal(result, expected) - - def test_apply_trivial(self): - # GH 20066 - # trivial apply: ignore input and return a constant dataframe. - df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df.iloc[1:]) - - assert_frame_equal(result, expected) - - @pytest.mark.xfail(reason=("GH 20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object.")) - def test_apply_trivial_fail(self): - # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. 
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df, df], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df) - - assert_frame_equal(result, expected) - - def test_time_field_bug(self): - # Test a fix for the following error related to GH issue 11324 When - # non-key fields in a group-by dataframe contained time-based fields - # that were not returned by the apply function, an exception would be - # raised. - - df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]}) - - def func_with_no_date(batch): - return pd.Series({'c': 2}) - - def func_with_date(batch): - return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) - - dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) - dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) - dfg_no_conversion_expected.index.name = 'a' - - dfg_conversion = df.groupby(by=['a']).apply(func_with_date) - dfg_conversion_expected = pd.DataFrame( - {'b': datetime(2015, 1, 1), - 'c': 2}, index=[1]) - dfg_conversion_expected.index.name = 'a' - - tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) - tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected) - - def test_len(self): - df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) - assert len(grouped) == len(df) - - grouped = df.groupby([lambda x: x.year, lambda x: x.month]) - expected = len({(x.year, x.month) for x in df.index}) - assert len(grouped) == expected - - # issue 11016 - df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) - assert len(df.groupby(('a'))) == 0 - assert len(df.groupby(('b'))) == 3 - assert len(df.groupby(['a', 'b'])) == 3 - - def test_basic_regression(self): - # regression - T = [1.0 * x for x in lrange(1, 10) * 10][:1095] - result = Series(T, lrange(0, len(T))) - - groupings = np.random.random((1100, )) - groupings = Series(groupings, lrange(0, len(groupings))) * 10. 
- - grouped = result.groupby(groupings) - grouped.mean() - - def test_with_na_groups(self): - index = Index(np.arange(10)) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']: - values = Series(np.ones(10), index, dtype=dtype) - labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, - 'bar', 'bar', np.nan, 'foo'], index=index) - - # this SHOULD be an int - grouped = values.groupby(labels) - agged = grouped.agg(len) - expected = Series([4, 2], index=['bar', 'foo']) - - assert_series_equal(agged, expected, check_dtype=False) - - # assert issubclass(agged.dtype.type, np.integer) - - # explicitly return a float from my function - def f(x): - return float(len(x)) - - agged = grouped.agg(f) - expected = Series([4, 2], index=['bar', 'foo']) - - assert_series_equal(agged, expected, check_dtype=False) - assert issubclass(agged.dtype.type, np.dtype(dtype).type) - - def test_indices_concatenation_order(self): - - # GH 2808 - - def f1(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, - names=['b', 'c']) - res = DataFrame(None, columns=['a'], index=multiindex) - return res - else: - y = y.set_index(['b', 'c']) - return y - - def f2(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - return DataFrame() - else: - y = y.set_index(['b', 'c']) - return y - - def f3(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, - names=['foo', 'bar']) - res = DataFrame(None, columns=['a', 'b'], index=multiindex) - return res - else: - return y - - df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) - - df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) - - # correct result - result1 = df.groupby('a').apply(f1) - result2 = df2.groupby('a').apply(f1) - assert_frame_equal(result1, result2) - - # should fail (not the same number of levels) - pytest.raises(AssertionError, df.groupby('a').apply, f2) - pytest.raises(AssertionError, df2.groupby('a').apply, f2) - - # should fail (incorrect shape) - pytest.raises(AssertionError, df.groupby('a').apply, f3) - pytest.raises(AssertionError, df2.groupby('a').apply, f3) - - def test_attr_wrapper(self): - grouped = self.ts.groupby(lambda x: x.weekday()) - - result = grouped.std() - expected = grouped.agg(lambda x: np.std(x, ddof=1)) - assert_series_equal(result, expected) - - # this is pretty cool - result = grouped.describe() - expected = {} - for name, gp in grouped: - expected[name] = gp.describe() - expected = DataFrame(expected).T - assert_frame_equal(result, expected) - - # get attribute - result = grouped.dtype - expected = grouped.agg(lambda x: x.dtype) - - # make sure raises error - pytest.raises(AttributeError, getattr, grouped, 'foo') - - def test_frame_groupby(self): - grouped = self.tsframe.groupby(lambda x: x.weekday()) - - # aggregate - aggregated = grouped.aggregate(np.mean) - assert len(aggregated) == 5 - assert len(aggregated.columns) == 4 - - # by string - tscopy = self.tsframe.copy() - tscopy['weekday'] = [x.weekday() for x in tscopy.index] - stragged = tscopy.groupby('weekday').aggregate(np.mean) - assert_frame_equal(stragged, aggregated, check_names=False) - - # transform - grouped = self.tsframe.head(30).groupby(lambda x: x.weekday()) - transformed = grouped.transform(lambda x: x - x.mean()) - assert len(transformed) == 30 - assert len(transformed.columns) == 4 - - # transform propagate - transformed = grouped.transform(lambda x: x.mean()) - for name, group in grouped: - mean = 
group.mean() - for idx in group.index: - tm.assert_series_equal(transformed.xs(idx), mean, - check_names=False) - - # iterate - for weekday, group in grouped: - assert group.index[0].weekday() == weekday - - # groups / group_indices - groups = grouped.groups - indices = grouped.indices - - for k, v in compat.iteritems(groups): - samething = self.tsframe.index.take(indices[k]) - assert (samething == v).all() - - def test_frame_groupby_columns(self): - mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} - grouped = self.tsframe.groupby(mapping, axis=1) - - # aggregate - aggregated = grouped.aggregate(np.mean) - assert len(aggregated) == len(self.tsframe) - assert len(aggregated.columns) == 2 - - # transform - tf = lambda x: x - x.mean() - groupedT = self.tsframe.T.groupby(mapping, axis=0) - assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) - - # iterate - for k, v in grouped: - assert len(v.columns) == 2 - - def test_frame_set_name_single(self): - grouped = self.df.groupby('A') - - result = grouped.mean() - assert result.index.name == 'A' - - result = self.df.groupby('A', as_index=False).mean() - assert result.index.name != 'A' - - result = grouped.agg(np.mean) - assert result.index.name == 'A' - - result = grouped.agg({'C': np.mean, 'D': np.std}) - assert result.index.name == 'A' - - result = grouped['C'].mean() - assert result.index.name == 'A' - result = grouped['C'].agg(np.mean) - assert result.index.name == 'A' - result = grouped['C'].agg([np.mean, np.std]) - assert result.index.name == 'A' - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) - assert result.index.name == 'A' - - def test_multi_func(self): - col1 = self.df['A'] - col2 = self.df['B'] - - grouped = self.df.groupby([col1.get, col2.get]) - agged = grouped.mean() - expected = self.df.groupby(['A', 'B']).mean() - - # TODO groupby get drops names - assert_frame_equal(agged.loc[:, ['C', 'D']], - expected.loc[:, ['C', 'D']], - check_names=False) - - # some "groups" with no data - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, - index=['one', 'two', 'three', 'four', 'five', 'six']) - # only verify that it works for now - grouped = df.groupby(['k1', 'k2']) - grouped.agg(np.sum) - - def test_multi_key_multiple_functions(self): - grouped = self.df.groupby(['A', 'B'])['C'] - - agged = grouped.agg([np.mean, np.std]) - expected = DataFrame({'mean': grouped.agg(np.mean), - 'std': grouped.agg(np.std)}) - assert_frame_equal(agged, expected) - - def test_frame_multi_key_function_list(self): - data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - grouped = data.groupby(['A', 'B']) - funcs = [np.mean, np.std] - agged = grouped.agg(funcs) - expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), - grouped['F'].agg(funcs)], - keys=['D', 'E', 'F'], axis=1) - assert (isinstance(agged.index, MultiIndex)) - assert (isinstance(expected.index, MultiIndex)) - assert_frame_equal(agged, expected) - - def test_groupby_multiple_columns(self): - data = self.df - grouped = data.groupby(['A', 'B']) - 
- def _check_op(op): - - with catch_warnings(record=True): - result1 = op(grouped) - - expected = defaultdict(dict) - for n1, gp1 in data.groupby('A'): - for n2, gp2 in gp1.groupby('B'): - expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) - expected = dict((k, DataFrame(v)) - for k, v in compat.iteritems(expected)) - expected = Panel.fromDict(expected).swapaxes(0, 1) - expected.major_axis.name, expected.minor_axis.name = 'A', 'B' - - # a little bit crude - for col in ['C', 'D']: - result_col = op(grouped[col]) - exp = expected[col] - pivoted = result1[col].unstack() - pivoted2 = result_col.unstack() - assert_frame_equal(pivoted.reindex_like(exp), exp) - assert_frame_equal(pivoted2.reindex_like(exp), exp) - - _check_op(lambda x: x.sum()) - _check_op(lambda x: x.mean()) - - # test single series works the same - result = data['C'].groupby([data['A'], data['B']]).mean() - expected = data.groupby(['A', 'B']).mean()['C'] - - assert_series_equal(result, expected) - - def test_groupby_as_index_agg(self): - grouped = self.df.groupby('A', as_index=False) - - # single-key - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) - expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] - assert_frame_equal(result2, expected2) - - grouped = self.df.groupby('A', as_index=True) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result3 = grouped['C'].agg({'Q': np.sum}) - assert_frame_equal(result3, expected3) - - # multi-key - - grouped = self.df.groupby(['A', 'B'], as_index=False) - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) - expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] - assert_frame_equal(result2, expected2) - - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) +@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64', + 'int32', 'int16', 'int8']) +def test_with_na_groups(dtype): + index = Index(np.arange(10)) + values = Series(np.ones(10), index, dtype=dtype) + labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, + 'bar', 'bar', np.nan, 'foo'], index=index) + + # this SHOULD be an int + grouped = values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + + # assert issubclass(agged.dtype.type, np.integer) + + # explicitly return a float from my function + def f(x): + return float(len(x)) + + agged = grouped.agg(f) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + assert issubclass(agged.dtype.type, np.dtype(dtype).type) + + +def test_indices_concatenation_order(): + + # GH 2808 + + def f1(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + names=['b', 'c']) + res = DataFrame(None, columns=['a'], index=multiindex) + return res + else: + y = y.set_index(['b', 'c']) + return y + + def f2(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + return DataFrame() + else: + y = y.set_index(['b', 'c']) + return y + + def f3(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + names=['foo', 
'bar']) + res = DataFrame(None, columns=['a', 'b'], index=multiindex) + return res + else: + return y + + df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) + + df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) + + # correct result + result1 = df.groupby('a').apply(f1) + result2 = df2.groupby('a').apply(f1) + assert_frame_equal(result1, result2) + + # should fail (not the same number of levels) + pytest.raises(AssertionError, df.groupby('a').apply, f2) + pytest.raises(AssertionError, df2.groupby('a').apply, f2) + + # should fail (incorrect shape) + pytest.raises(AssertionError, df.groupby('a').apply, f3) + pytest.raises(AssertionError, df2.groupby('a').apply, f3) + + +def test_attr_wrapper(ts): + grouped = ts.groupby(lambda x: x.weekday()) + + result = grouped.std() + expected = grouped.agg(lambda x: np.std(x, ddof=1)) + assert_series_equal(result, expected) + + # this is pretty cool + result = grouped.describe() + expected = {} + for name, gp in grouped: + expected[name] = gp.describe() + expected = DataFrame(expected).T + assert_frame_equal(result, expected) + + # get attribute + result = grouped.dtype + expected = grouped.agg(lambda x: x.dtype) + + # make sure raises error + pytest.raises(AttributeError, getattr, grouped, 'foo') + + +def test_frame_groupby(tsframe): + grouped = tsframe.groupby(lambda x: x.weekday()) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == 5 + assert len(aggregated.columns) == 4 + + # by string + tscopy = tsframe.copy() + tscopy['weekday'] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby('weekday').aggregate(np.mean) + assert_frame_equal(stragged, aggregated, check_names=False) + + # transform + grouped = tsframe.head(30).groupby(lambda x: x.weekday()) + transformed = grouped.transform(lambda x: x - x.mean()) + assert len(transformed) == 30 + assert len(transformed.columns) == 4 + + # transform propagate + transformed = grouped.transform(lambda x: x.mean()) + for name, group in grouped: + mean = group.mean() + for idx in group.index: + tm.assert_series_equal(transformed.xs(idx), mean, + check_names=False) + + # iterate + for weekday, group in grouped: + assert group.index[0].weekday() == weekday + + # groups / group_indices + groups = grouped.groups + indices = grouped.indices + + for k, v in compat.iteritems(groups): + samething = tsframe.index.take(indices[k]) + assert (samething == v).all() + + +def test_frame_groupby_columns(tsframe): + mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} + grouped = tsframe.groupby(mapping, axis=1) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == len(tsframe) + assert len(aggregated.columns) == 2 + + # transform + tf = lambda x: x - x.mean() + groupedT = tsframe.T.groupby(mapping, axis=0) + assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) + + # iterate + for k, v in grouped: + assert len(v.columns) == 2 + + +def test_frame_set_name_single(df): + grouped = df.groupby('A') + + result = grouped.mean() + assert result.index.name == 'A' + + result = df.groupby('A', as_index=False).mean() + assert result.index.name != 'A' + + result = grouped.agg(np.mean) + assert result.index.name == 'A' + + result = grouped.agg({'C': np.mean, 'D': np.std}) + assert result.index.name == 'A' + + result = grouped['C'].mean() + assert result.index.name == 'A' + result = grouped['C'].agg(np.mean) + assert result.index.name == 'A' + result = grouped['C'].agg([np.mean, np.std]) + assert result.index.name == 'A' 
+ + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) + assert result.index.name == 'A' + + +def test_multi_func(df): + col1 = df['A'] + col2 = df['B'] + + grouped = df.groupby([col1.get, col2.get]) + agged = grouped.mean() + expected = df.groupby(['A', 'B']).mean() + + # TODO groupby get drops names + assert_frame_equal(agged.loc[:, ['C', 'D']], + expected.loc[:, ['C', 'D']], + check_names=False) + + # some "groups" with no data + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + # only verify that it works for now + grouped = df.groupby(['k1', 'k2']) + grouped.agg(np.sum) + + +def test_multi_key_multiple_functions(df): + grouped = df.groupby(['A', 'B'])['C'] + + agged = grouped.agg([np.mean, np.std]) + expected = DataFrame({'mean': grouped.agg(np.mean), + 'std': grouped.agg(np.std)}) + assert_frame_equal(agged, expected) + + +def test_frame_multi_key_function_list(): + data = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + grouped = data.groupby(['A', 'B']) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), + grouped['F'].agg(funcs)], + keys=['D', 'E', 'F'], axis=1) + assert (isinstance(agged.index, MultiIndex)) + assert (isinstance(expected.index, MultiIndex)) + assert_frame_equal(agged, expected) + + +@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()]) +def test_groupby_multiple_columns(df, op): + data = df + grouped = data.groupby(['A', 'B']) + + with catch_warnings(record=True): + result1 = op(grouped) + + expected = defaultdict(dict) + for n1, gp1 in data.groupby('A'): + for n2, gp2 in gp1.groupby('B'): + expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) + expected = dict((k, DataFrame(v)) + for k, v in compat.iteritems(expected)) + expected = Panel.fromDict(expected).swapaxes(0, 1) + expected.major_axis.name, expected.minor_axis.name = 'A', 'B' + + # a little bit crude + for col in ['C', 'D']: + result_col = op(grouped[col]) + exp = expected[col] + pivoted = result1[col].unstack() + pivoted2 = result_col.unstack() + assert_frame_equal(pivoted.reindex_like(exp), exp) + assert_frame_equal(pivoted2.reindex_like(exp), exp) + + # test single series works the same + result = data['C'].groupby([data['A'], data['B']]).mean() + expected = data.groupby(['A', 'B']).mean()['C'] + + assert_series_equal(result, expected) + + +def test_groupby_as_index_agg(df): + grouped = df.groupby('A', as_index=False) + + # single-key + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + grouped = df.groupby('A', as_index=True) + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): 
result3 = grouped['C'].agg({'Q': np.sum}) - assert_frame_equal(result3, expected3) - - # GH7115 & GH8112 & GH8582 - df = DataFrame(np.random.randint(0, 100, (50, 3)), - columns=['jim', 'joe', 'jolie']) - ts = Series(np.random.randint(5, 10, 50), name='jim') - - gr = df.groupby(ts) - gr.nth(0) # invokes set_selection_from_grouper internally - assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) - - for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: - gr = df.groupby(ts, as_index=False) - left = getattr(gr, attr)() - - gr = df.groupby(ts.values, as_index=True) - right = getattr(gr, attr)().reset_index(drop=True) - - assert_frame_equal(left, right) - - def test_as_index_series_return_frame(self): - grouped = self.df.groupby('A', as_index=False) - grouped2 = self.df.groupby(['A', 'B'], as_index=False) - - result = grouped['C'].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ['A', 'C']] - assert isinstance(result, DataFrame) - assert_frame_equal(result, expected) - - result2 = grouped2['C'].agg(np.sum) - expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] - assert isinstance(result2, DataFrame) - assert_frame_equal(result2, expected2) - - result = grouped['C'].sum() - expected = grouped.sum().loc[:, ['A', 'C']] - assert isinstance(result, DataFrame) - assert_frame_equal(result, expected) - - result2 = grouped2['C'].sum() - expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] - assert isinstance(result2, DataFrame) - assert_frame_equal(result2, expected2) - - # corner case - pytest.raises(Exception, grouped['C'].__getitem__, 'D') - - def test_groupby_as_index_cython(self): - data = self.df - - # single-key - grouped = data.groupby('A', as_index=False) - result = grouped.mean() - expected = data.groupby(['A']).mean() - expected.insert(0, 'A', expected.index) - expected.index = np.arange(len(expected)) - assert_frame_equal(result, expected) - - # multi-key - grouped = data.groupby(['A', 'B'], as_index=False) - result = grouped.mean() - expected = data.groupby(['A', 'B']).mean() - - arrays = lzip(*expected.index.values) - expected.insert(0, 'A', arrays[0]) - expected.insert(1, 'B', arrays[1]) - expected.index = np.arange(len(expected)) - assert_frame_equal(result, expected) - - def test_groupby_as_index_series_scalar(self): - grouped = self.df.groupby(['A', 'B'], as_index=False) - - # GH #421 - - result = grouped['C'].agg(len) - expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) - - def test_groupby_as_index_corner(self): - pytest.raises(TypeError, self.ts.groupby, lambda x: x.weekday(), - as_index=False) - - pytest.raises(ValueError, self.df.groupby, lambda x: x.lower(), - as_index=False, axis=1) - - def test_groupby_as_index_apply(self): - # GH #4648 and #3417 - df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], - 'user_id': [1, 2, 1, 1, 3, 1], - 'time': range(6)}) - - g_as = df.groupby('user_id', as_index=True) - g_not_as = df.groupby('user_id', as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - assert_index_equal(res_as, exp) - assert_index_equal(res_not_as, exp) - - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index - - # apply doesn't maintain the original ordering - # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), ( - 2, 4)]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, 
names=['user_id', None]) - - assert_index_equal(res_as_apply, exp_as_apply) - assert_index_equal(res_not_as_apply, exp_not_as_apply) - - ind = Index(list('abcde')) - df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False).apply(lambda x: x).index - assert_index_equal(res, ind) - - def test_groupby_multiple_key(self): - df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) - agged = grouped.sum() - assert_almost_equal(df.values, agged.values) - - grouped = df.T.groupby([lambda x: x.year, - lambda x: x.month, - lambda x: x.day], axis=1) - - agged = grouped.agg(lambda x: x.sum()) - tm.assert_index_equal(agged.index, df.columns) - assert_almost_equal(df.T.values, agged.values) - - agged = grouped.agg(lambda x: x.sum()) - assert_almost_equal(df.T.values, agged.values) - - def test_groupby_multi_corner(self): - # test that having an all-NA column doesn't mess you up - df = self.df.copy() - df['bad'] = np.nan - agged = df.groupby(['A', 'B']).mean() - - expected = self.df.groupby(['A', 'B']).mean() - expected['bad'] = np.nan - - assert_frame_equal(agged, expected) - - def test_omit_nuisance(self): - grouped = self.df.groupby('A') - - result = grouped.mean() - expected = self.df.loc[:, ['A', 'C', 'D']].groupby('A').mean() - assert_frame_equal(result, expected) - - agged = grouped.agg(np.mean) - exp = grouped.mean() - assert_frame_equal(agged, exp) - - df = self.df.loc[:, ['A', 'C', 'D']] - df['E'] = datetime.now() - grouped = df.groupby('A') - result = grouped.agg(np.sum) - expected = grouped.sum() - assert_frame_equal(result, expected) - - # won't work with axis = 1 - grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) - result = pytest.raises(TypeError, grouped.agg, - lambda x: x.sum(0, numeric_only=False)) - - def test_omit_nuisance_python_multiple(self): - grouped = self.three_group.groupby(['A', 'B']) - - agged = grouped.agg(np.mean) - exp = grouped.mean() - assert_frame_equal(agged, exp) - - def test_empty_groups_corner(self): - # handle empty groups - df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2']), - 'k3': ['foo', 'bar'] * 3, - 'v1': np.random.randn(6), - 'v2': np.random.randn(6)}) - - grouped = df.groupby(['k1', 'k2']) - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - grouped = self.mframe[3:5].groupby(level=0) - agged = grouped.apply(lambda x: x.mean()) - agged_A = grouped['A'].apply(np.mean) - assert_series_equal(agged['A'], agged_A) - assert agged.index.name == 'first' - - def test_apply_concat_preserve_names(self): - grouped = self.three_group.groupby(['A', 'B']) - - def desc(group): - result = group.describe() - result.index.name = 'stat' - return result - - def desc2(group): - result = group.describe() - result.index.name = 'stat' - result = result[:len(group)] - # weirdo - return result - - def desc3(group): - result = group.describe() - - # names are different - result.index.name = 'stat_%d' % len(group) - - result = result[:len(group)] - # weirdo - return result - - result = grouped.apply(desc) - assert result.index.names == ('A', 'B', 'stat') - - result2 = grouped.apply(desc2) - assert result2.index.names == ('A', 'B', 'stat') - - result3 = grouped.apply(desc3) - assert result3.index.names == ('A', 'B', None) - - def test_nonsense_func(self): - df = DataFrame([0]) - pytest.raises(Exception, df.groupby, lambda x: x + 'foo') - - def 
test_builtins_apply(self): # GH8155 - df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), - columns=['jim', 'joe']) - df['jolie'] = np.random.randn(1000) - - for keys in ['jim', ['jim', 'joe']]: # single key & multi-key - if keys == 'jim': - continue - for f in [max, min, sum]: - fname = f.__name__ - result = df.groupby(keys).apply(f) - result.shape - ngroups = len(df.drop_duplicates(subset=keys)) - assert result.shape == (ngroups, 3), 'invalid frame shape: '\ - '{} (expected ({}, 3))'.format(result.shape, ngroups) - - assert_frame_equal(result, # numpy's equivalent function - df.groupby(keys).apply(getattr(np, fname))) - - if f != sum: - expected = df.groupby(keys).agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) - assert_frame_equal(result, expected, check_dtype=False) - - assert_series_equal(getattr(result, fname)(), - getattr(df, fname)()) - - def test_max_min_non_numeric(self): - # #2700 - aa = DataFrame({'nn': [11, 11, 22, 22], - 'ii': [1, 2, 3, 4], - 'ss': 4 * ['mama']}) - - result = aa.groupby('nn').max() - assert 'ss' in result - - result = aa.groupby('nn').max(numeric_only=False) - assert 'ss' in result - - result = aa.groupby('nn').min() - assert 'ss' in result - - result = aa.groupby('nn').min(numeric_only=False) - assert 'ss' in result - - def test_arg_passthru(self): - # make sure that we are passing thru kwargs - # to our agg functions - - # GH3668 - # GH5724 - df = pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - - expected_columns_numeric = Index(['int', 'float', 'category_int']) - - # mean / median - expected = pd.DataFrame( - {'category_int': [7.5, 9], - 'float': [4.5, 6.], - 'timedelta': [pd.Timedelta('1.5s'), - pd.Timedelta('3s')], - 'int': [1.5, 3], - 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), - pd.Timestamp('2013-01-03 00:00:00')], - 'datetimetz': [ - pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), - pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, - index=Index([1, 2], name='group'), - columns=['int', 'float', 'category_int', - 'datetime', 'datetimetz', 'timedelta']) - for attr in ['mean', 'median']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - assert_frame_equal(result.reindex_like(expected), expected) - - # TODO: min, max *should* handle - # categorical (ordered) dtype - expected_columns = Index(['int', 'float', 'string', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['min', 'max']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['first', 'last']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - 
tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'string', - 'category_int', 'timedelta']) - for attr in ['sum']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'category_int']) - for attr in ['prod', 'cumprod']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - # like min, max, but don't include strings - expected_columns = Index(['int', 'float', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['cummin', 'cummax']: - f = getattr(df.groupby('group'), attr) - result = f() - # GH 15561: numeric_only=False set by default like min/max - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'category_int', - 'timedelta']) - for attr in ['cumsum']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - def test_wrap_aggregated_output_multindex(self): - df = self.mframe.T - df['baz', 'two'] = 'peekaboo' - - keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - agged = df.groupby(keys).agg(np.mean) - assert isinstance(agged.columns, MultiIndex) - - def aggfun(ser): - if ser.name == ('foo', 'one'): - raise TypeError - else: - return ser.sum() - - agged2 = df.groupby(keys).aggregate(aggfun) - assert len(agged2.columns) + 1 == len(df.columns) - - def test_groupby_level_apply(self): - frame = self.mframe - - result = frame.groupby(level=0).count() - assert result.index.name == 'first' - result = frame.groupby(level=1).count() - assert result.index.name == 'second' - - result = frame['A'].groupby(level=0).count() - assert result.index.name == 'first' - - def test_groupby_level_mapper(self): - frame = self.mframe - deleveled = frame.reset_index() - - mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} - mapper1 = {'one': 0, 'two': 0, 'three': 1} - - result0 = frame.groupby(mapper0, level=0).sum() - result1 = frame.groupby(mapper1, level=1).sum() - - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) - expected0 = frame.groupby(mapped_level0).sum() - expected1 = frame.groupby(mapped_level1).sum() - expected0.index.name, expected1.index.name = 'first', 'second' - - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - - def test_groupby_level_nonmulti(self): - # GH 1313, GH 13901 - s = Series([1, 2, 3, 10, 4, 5, 20, 6], - Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) - expected = Series([11, 22, 3, 4, 5, 6], - Index(range(1, 7), name='foo')) - - result = s.groupby(level=0).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=[0]).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=-1).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=[-1]).sum() - tm.assert_series_equal(result, expected) - - pytest.raises(ValueError, s.groupby, level=1) - pytest.raises(ValueError, 
s.groupby, level=-2) - pytest.raises(ValueError, s.groupby, level=[]) - pytest.raises(ValueError, s.groupby, level=[0, 0]) - pytest.raises(ValueError, s.groupby, level=[0, 1]) - pytest.raises(ValueError, s.groupby, level=[1]) - - def test_groupby_complex(self): - # GH 12902 - a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) - expected = Series((1 + 2j, 5 + 10j)) - - result = a.groupby(level=0).sum() - assert_series_equal(result, expected) - - result = a.sum(level=0) - assert_series_equal(result, expected) - - def test_apply_series_to_frame(self): - def f(piece): - with np.errstate(invalid='ignore'): - logged = np.log(piece) - return DataFrame({'value': piece, - 'demeaned': piece - piece.mean(), - 'logged': logged}) - - dr = bdate_range('1/1/2000', periods=100) - ts = Series(np.random.randn(100), index=dr) - - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(f) - - assert isinstance(result, DataFrame) - tm.assert_index_equal(result.index, ts.index) - - def test_apply_series_yield_constant(self): - result = self.df.groupby(['A', 'B'])['C'].apply(len) - assert result.index.names[:2] == ('A', 'B') - - def test_apply_frame_yield_constant(self): - # GH13568 - result = self.df.groupby(['A', 'B']).apply(len) - assert isinstance(result, Series) - assert result.name is None - - result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len) - assert isinstance(result, Series) - assert result.name is None - - def test_apply_frame_to_series(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.apply(len) - expected = grouped.count()['C'] - tm.assert_index_equal(result.index, expected.index) - tm.assert_numpy_array_equal(result.values, expected.values) - - def test_apply_frame_concat_series(self): - def trans(group): - return group.groupby('B')['C'].sum().sort_values()[:2] - - def trans2(group): - grouped = group.groupby(df.reindex(group.index)['B']) - return grouped.sum().sort_values()[:2] - - df = DataFrame({'A': np.random.randint(0, 5, 1000), - 'B': np.random.randint(0, 5, 1000), - 'C': np.random.randn(1000)}) - - result = df.groupby('A').apply(trans) - exp = df.groupby('A')['C'].apply(trans2) - assert_series_equal(result, exp, check_names=False) - assert result.name == 'C' - - def test_apply_transform(self): - grouped = self.ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x * 2) - expected = grouped.transform(lambda x: x * 2) - assert_series_equal(result, expected) - - def test_apply_multikey_corner(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - - def f(group): - return group.sort_values('A')[-5:] - - result = grouped.apply(f) - for key, group in grouped: - assert_frame_equal(result.loc[key], f(group)) - - def test_mutate_groups(self): - - # GH3380 - - mydf = DataFrame({ - 'cat1': ['a'] * 8 + ['b'] * 6, - 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + - ['d'] * 2 + ['e'] * 2, - 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)), - 'val': np.random.randint(100, size=14), - }) - - def f_copy(x): - x = x.copy() - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() - - def f_no_copy(x): - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() - - grpby_copy = mydf.groupby('cat1').apply(f_copy) - grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy) - assert_series_equal(grpby_copy, grpby_no_copy) - - def test_no_mutate_but_looks_like(self): - - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = 
DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) - - result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) - assert_series_equal(result1, result2) - - def test_apply_chunk_view(self): - # Low level tinkering could be unsafe, make sure not - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'value': lrange(9)}) - - # return view - f = lambda x: x[:2] - - result = df.groupby('key', group_keys=False).apply(f) - expected = df.take([0, 1, 3, 4, 6, 7]) - assert_frame_equal(result, expected) - - def test_apply_no_name_column_conflict(self): - df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], - 'value': lrange(10)[::-1]}) - - # it works! #2605 - grouped = df.groupby(['name', 'name2']) - grouped.apply(lambda x: x.sort_values('value', inplace=True)) - - def test_groupby_series_indexed_differently(self): - s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], - index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) - s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], - index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) - - grouped = s1.groupby(s2) - agged = grouped.mean() - exp = s1.groupby(s2.reindex(s1.index).get).mean() - assert_series_equal(agged, exp) - - def test_groupby_with_hier_columns(self): - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', - 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', - 'one', 'two']])) - index = MultiIndex.from_tuples(tuples) - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( - 'B', 'cat'), ('A', 'dog')]) - df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) - - result = df.groupby(level=0).mean() - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0, axis=1).mean() - tm.assert_index_equal(result.index, df.index) - - result = df.groupby(level=0).agg(np.mean) - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0).apply(lambda x: x.mean()) - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) - tm.assert_index_equal(result.columns, Index(['A', 'B'])) - tm.assert_index_equal(result.index, df.index) - - # add a nuisance column - sorted_columns, _ = columns.sortlevel(0) - df['A', 'foo'] = 'bar' - result = df.groupby(level=0).mean() - tm.assert_index_equal(result.columns, df.columns[:-1]) - - def test_pass_args_kwargs(self): - from numpy import percentile - - def f(x, q=None, axis=0): - return percentile(x, q, axis=axis) - - g = lambda x: percentile(x, 80, axis=0) - - # Series - ts_grouped = self.ts.groupby(lambda x: x.month) - agg_result = ts_grouped.agg(percentile, 80, axis=0) - apply_result = ts_grouped.apply(percentile, 80, axis=0) - trans_result = ts_grouped.transform(percentile, 80, axis=0) - - agg_expected = ts_grouped.quantile(.8) - trans_expected = ts_grouped.transform(g) - - assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected, check_names=False) - assert_series_equal(trans_result, trans_expected) - - agg_result = ts_grouped.agg(f, q=80) - apply_result = ts_grouped.apply(f, q=80) - trans_result = ts_grouped.transform(f, q=80) - assert_series_equal(agg_result, agg_expected) - assert_series_equal(apply_result, agg_expected) - assert_series_equal(trans_result, trans_expected) - - # DataFrame - df_grouped = self.tsframe.groupby(lambda x: x.month) - agg_result = df_grouped.agg(percentile, 80, axis=0) - apply_result = 
df_grouped.apply(DataFrame.quantile, .8) - expected = df_grouped.quantile(.8) - assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected, check_names=False) - - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected, check_names=False) - assert_frame_equal(apply_result, expected) - - def test_non_cython_api(self): - - # GH5610 - # non-cython calls should not include the grouper - - df = DataFrame( - [[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, 'baz']], - columns=['A', 'B', 'C']) - g = df.groupby('A') - gni = df.groupby('A', as_index=False) - - # mad - expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' - result = g.mad() - assert_frame_equal(result, expected) - - expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], - index=[0, 1]) - result = gni.mad() - assert_frame_equal(result, expected) - - # describe - expected_index = pd.Index([1, 3], name='A') - expected_col = pd.MultiIndex(levels=[['B'], - ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']], - labels=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]], - index=expected_index, - columns=expected_col) - result = g.describe() - assert_frame_equal(result, expected) - - expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 3].describe().unstack().to_frame().T]) - expected.index = pd.Index([0, 1]) - result = gni.describe() - assert_frame_equal(result, expected) - - # any - expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - result = g.any() - assert_frame_equal(result, expected) - - # idxmax - expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' - result = g.idxmax() - assert_frame_equal(result, expected) - - def test_cython_api2(self): - - # this takes the fast apply path - - # cumsum (GH5614) - df = DataFrame( - [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] - ], columns=['A', 'B', 'C']) - expected = DataFrame( - [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum() - assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum() - assert_frame_equal(result, expected) - - # GH 13994 - result = df.groupby('A').cumsum(axis=1) - expected = df.cumsum(axis=1) - assert_frame_equal(result, expected) - result = df.groupby('A').cumprod(axis=1) - expected = df.cumprod(axis=1) - assert_frame_equal(result, expected) - - def test_grouping_ndarray(self): - grouped = self.df.groupby(self.df['A'].values) - - result = grouped.sum() - expected = self.df.groupby('A').sum() - assert_frame_equal(result, expected, check_names=False - ) # Note: no names when grouping by value - - def test_apply_typecast_fail(self): - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile( - ['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}) - - def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) - return group - - result = df.groupby('d').apply(f) - - expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) - - assert_frame_equal(result, expected) - - def test_apply_multiindex_fail(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = 
DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) - - def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) - return group - - result = df.groupby('d').apply(f) - - expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) - - assert_frame_equal(result, expected) - - def test_apply_corner(self): - result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) - expected = self.tsframe * 2 - assert_frame_equal(result, expected) - - def test_apply_without_copy(self): - # GH 5545 - # returning a non-copy in an applied function fails - - data = DataFrame({'id_field': [100, 100, 200, 300], - 'category': ['a', 'b', 'c', 'c'], - 'value': [1, 2, 3, 4]}) - - def filt1(x): - if x.shape[0] == 1: - return x.copy() - else: - return x[x.category == 'c'] - - def filt2(x): - if x.shape[0] == 1: - return x - else: - return x[x.category == 'c'] - - expected = data.groupby('id_field').apply(filt1) - result = data.groupby('id_field').apply(filt2) - assert_frame_equal(result, expected) - - def test_apply_corner_cases(self): - # #535, can't use sliding iterator - - N = 1000 - labels = np.random.randint(0, 100, size=N) - df = DataFrame({'key': labels, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) - - grouped = df.groupby('key') - - def f(g): - g['value3'] = g['value1'] * 2 - return g - - result = grouped.apply(f) - assert 'value3' in result + assert_frame_equal(result3, expected3) + + # multi-key + + grouped = df.groupby(['A', 'B'], as_index=False) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + result3 = grouped['C'].agg({'Q': np.sum}) + assert_frame_equal(result3, expected3) + + # GH7115 & GH8112 & GH8582 + df = DataFrame(np.random.randint(0, 100, (50, 3)), + columns=['jim', 'joe', 'jolie']) + ts = Series(np.random.randint(5, 10, 50), name='jim') + + gr = df.groupby(ts) + gr.nth(0) # invokes set_selection_from_grouper internally + assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) + + for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: + gr = df.groupby(ts, as_index=False) + left = getattr(gr, attr)() + + gr = df.groupby(ts.values, as_index=True) + right = getattr(gr, attr)().reset_index(drop=True) + + assert_frame_equal(left, right) + + +def test_as_index_series_return_frame(df): + grouped = df.groupby('A', as_index=False) + grouped2 = df.groupby(['A', 'B'], as_index=False) + + result = grouped['C'].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ['A', 'C']] + assert isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].agg(np.sum) + expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] + assert isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + result = grouped['C'].sum() + expected = grouped.sum().loc[:, ['A', 'C']] + assert isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].sum() + expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] + assert isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + # corner case + pytest.raises(Exception, grouped['C'].__getitem__, 'D') + + 
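A quick sketch of the contract the ``as_index`` tests above pin down: selecting one column from an ``as_index=False`` groupby and aggregating still returns a DataFrame, with the group key carried along as an ordinary column. Toy frame for illustration (not the test fixture); the assertions mirror what the tests above check:

    import pandas as pd

    df = pd.DataFrame({'A': ['x', 'x', 'y'], 'C': [1, 2, 3]})
    res = df.groupby('A', as_index=False)['C'].sum()

    assert isinstance(res, pd.DataFrame)    # a frame, not a Series
    assert list(res.columns) == ['A', 'C']  # key kept as a regular column
    #    A  C
    # 0  x  3
    # 1  y  3
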
+def test_groupby_as_index_cython(df): + data = df + + # single-key + grouped = data.groupby('A', as_index=False) + result = grouped.mean() + expected = data.groupby(['A']).mean() + expected.insert(0, 'A', expected.index) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + # multi-key + grouped = data.groupby(['A', 'B'], as_index=False) + result = grouped.mean() + expected = data.groupby(['A', 'B']).mean() + + arrays = lzip(*expected.index.values) + expected.insert(0, 'A', arrays[0]) + expected.insert(1, 'B', arrays[1]) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + +def test_groupby_as_index_series_scalar(df): + grouped = df.groupby(['A', 'B'], as_index=False) + + # GH #421 + + result = grouped['C'].agg(len) + expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + + +def test_groupby_as_index_corner(df, ts): + pytest.raises(TypeError, ts.groupby, lambda x: x.weekday(), + as_index=False) + + pytest.raises(ValueError, df.groupby, lambda x: x.lower(), + as_index=False, axis=1) + + +def test_groupby_multiple_key(df): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]) + agged = grouped.sum() + assert_almost_equal(df.values, agged.values) + + grouped = df.T.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day], axis=1) + + agged = grouped.agg(lambda x: x.sum()) + tm.assert_index_equal(agged.index, df.columns) + assert_almost_equal(df.T.values, agged.values) + + agged = grouped.agg(lambda x: x.sum()) + assert_almost_equal(df.T.values, agged.values) + + +def test_groupby_multi_corner(df): + # test that having an all-NA column doesn't mess you up + df = df.copy() + df['bad'] = np.nan + agged = df.groupby(['A', 'B']).mean() + + expected = df.groupby(['A', 'B']).mean() + expected['bad'] = np.nan + + assert_frame_equal(agged, expected) + + +def test_omit_nuisance(df): + grouped = df.groupby('A') + + result = grouped.mean() + expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + assert_frame_equal(result, expected) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + df = df.loc[:, ['A', 'C', 'D']] + df['E'] = datetime.now() + grouped = df.groupby('A') + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_frame_equal(result, expected) + + # won't work with axis = 1 + grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) + result = pytest.raises(TypeError, grouped.agg, + lambda x: x.sum(0, numeric_only=False)) + + +def test_omit_nuisance_python_multiple(three_group): + grouped = three_group.groupby(['A', 'B']) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + +def test_empty_groups_corner(mframe): + # handle empty groups + df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2']), + 'k3': ['foo', 'bar'] * 3, + 'v1': np.random.randn(6), + 'v2': np.random.randn(6)}) + + grouped = df.groupby(['k1', 'k2']) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + grouped = mframe[3:5].groupby(level=0) + agged = grouped.apply(lambda x: x.mean()) + agged_A = grouped['A'].apply(np.mean) + assert_series_equal(agged['A'], agged_A) + assert agged.index.name == 'first' + + +def test_nonsense_func(): + df = DataFrame([0]) + pytest.raises(Exception, df.groupby, lambda x: x + 'foo') + + +def 
test_wrap_aggregated_output_multindex(mframe): + df = mframe.T + df['baz', 'two'] = 'peekaboo' + + keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] + agged = df.groupby(keys).agg(np.mean) + assert isinstance(agged.columns, MultiIndex) + + def aggfun(ser): + if ser.name == ('foo', 'one'): + raise TypeError + else: + return ser.sum() + + agged2 = df.groupby(keys).aggregate(aggfun) + assert len(agged2.columns) + 1 == len(df.columns) + + +def test_groupby_level_apply(mframe): + + result = mframe.groupby(level=0).count() + assert result.index.name == 'first' + result = mframe.groupby(level=1).count() + assert result.index.name == 'second' + + result = mframe['A'].groupby(level=0).count() + assert result.index.name == 'first' + + +def test_groupby_level_mapper(mframe): + deleveled = mframe.reset_index() + + mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} + mapper1 = {'one': 0, 'two': 0, 'three': 1} + + result0 = mframe.groupby(mapper0, level=0).sum() + result1 = mframe.groupby(mapper1, level=1).sum() + + mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) + expected0 = mframe.groupby(mapped_level0).sum() + expected1 = mframe.groupby(mapped_level1).sum() + expected0.index.name, expected1.index.name = 'first', 'second' + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + +def test_groupby_level_nonmulti(): + # GH 1313, GH 13901 + s = Series([1, 2, 3, 10, 4, 5, 20, 6], + Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) + expected = Series([11, 22, 3, 4, 5, 6], + Index(range(1, 7), name='foo')) + + result = s.groupby(level=0).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=[0]).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=-1).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=[-1]).sum() + tm.assert_series_equal(result, expected) + + pytest.raises(ValueError, s.groupby, level=1) + pytest.raises(ValueError, s.groupby, level=-2) + pytest.raises(ValueError, s.groupby, level=[]) + pytest.raises(ValueError, s.groupby, level=[0, 0]) + pytest.raises(ValueError, s.groupby, level=[0, 1]) + pytest.raises(ValueError, s.groupby, level=[1]) + + +def test_groupby_complex(): + # GH 12902 + a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) + expected = Series((1 + 2j, 5 + 10j)) + + result = a.groupby(level=0).sum() + assert_series_equal(result, expected) + + result = a.sum(level=0) + assert_series_equal(result, expected) + + +def test_mutate_groups(): + + # GH3380 + + df = DataFrame({ + 'cat1': ['a'] * 8 + ['b'] * 6, + 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + + ['d'] * 2 + ['e'] * 2, + 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)), + 'val': np.random.randint(100, size=14), + }) + + def f_copy(x): + x = x.copy() + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + def f_no_copy(x): + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + grpby_copy = df.groupby('cat1').apply(f_copy) + grpby_no_copy = df.groupby('cat1').apply(f_no_copy) + assert_series_equal(grpby_copy, grpby_no_copy) + + +def test_no_mutate_but_looks_like(): + + # GH 8467 + # first show's mutation indicator + # second does not, but should yield the same results + df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) + + result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby('key', 
group_keys=True).apply(lambda x: x.key) + assert_series_equal(result1, result2) + + +def test_groupby_series_indexed_differently(): + s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], + index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) + s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], + index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) + + grouped = s1.groupby(s2) + agged = grouped.mean() + exp = s1.groupby(s2.reindex(s1.index).get).mean() + assert_series_equal(agged, exp) + + +def test_groupby_with_hier_columns(): + tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', + 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', + 'one', 'two']])) + index = MultiIndex.from_tuples(tuples) + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( + 'B', 'cat'), ('A', 'dog')]) + df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) + + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).mean() + tm.assert_index_equal(result.index, df.index) + + result = df.groupby(level=0).agg(np.mean) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0).apply(lambda x: x.mean()) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + tm.assert_index_equal(result.columns, Index(['A', 'B'])) + tm.assert_index_equal(result.index, df.index) + + # add a nuisance column + sorted_columns, _ = columns.sortlevel(0) + df['A', 'foo'] = 'bar' + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, df.columns[:-1]) - def test_groupby_wrong_multi_labels(self): - data = """index,foo,bar,baz,spam,data + +def test_grouping_ndarray(df): + grouped = df.groupby(df['A'].values) + + result = grouped.sum() + expected = df.groupby('A').sum() + assert_frame_equal(result, expected, check_names=False + ) # Note: no names when grouping by value + + +def test_groupby_wrong_multi_labels(): + data = """index,foo,bar,baz,spam,data 0,foo1,bar1,baz1,spam2,20 1,foo1,bar2,baz1,spam3,30 2,foo2,bar2,baz1,spam2,40 3,foo1,bar1,baz2,spam1,50 4,foo3,bar1,baz2,spam1,60""" - data = read_csv(StringIO(data), index_col=0) - - grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - def test_groupby_series_with_name(self): - result = self.df.groupby(self.df['A']).mean() - result2 = self.df.groupby(self.df['A'], as_index=False).mean() - assert result.index.name == 'A' - assert 'A' in result2 - - result = self.df.groupby([self.df['A'], self.df['B']]).mean() - result2 = self.df.groupby([self.df['A'], self.df['B']], - as_index=False).mean() - assert result.index.names == ('A', 'B') - assert 'A' in result2 - assert 'B' in result2 - - def test_seriesgroupby_name_attr(self): - # GH 6265 - result = self.df.groupby('A')['C'] - assert result.count().name == 'C' - assert result.mean().name == 'C' - - testFunc = lambda x: np.sum(x) * 2 - assert result.agg(testFunc).name == 'C' - - def test_consistency_name(self): - # GH 12363 - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - expected = df.groupby(['A']).B.count() - result = df.B.groupby(df.A).count() - assert_series_equal(result, expected) - - def test_groupby_name_propagation(self): - # GH 6124 - def summarize(df, name=None): - return Series({'count': 
1, 'mean': 2, 'omissions': 3, }, name=name) - - def summarize_random_name(df): - # Provide a different name for each Series. In this case, groupby - # should not attempt to propagate the Series name since they are - # inconsistent. - return Series({ - 'count': 1, - 'mean': 2, - 'omissions': 3, - }, name=df.iloc[0]['A']) - - metrics = self.df.groupby('A').apply(summarize) - assert metrics.columns.name is None - metrics = self.df.groupby('A').apply(summarize, 'metrics') - assert metrics.columns.name == 'metrics' - metrics = self.df.groupby('A').apply(summarize_random_name) - assert metrics.columns.name is None - - def test_groupby_nonstring_columns(self): - df = DataFrame([np.arange(10) for x in range(10)]) - grouped = df.groupby(0) - result = grouped.mean() - expected = df.groupby(df[0]).mean() - assert_frame_equal(result, expected) - - def test_groupby_mixed_type_columns(self): - # GH 13432, unorderable types in py3 - df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) - expected = DataFrame([[1, 2]], columns=['B', 0], - index=Index([0], name='A')) - - result = df.groupby('A').first() - tm.assert_frame_equal(result, expected) - - result = df.groupby('A').sum() - tm.assert_frame_equal(result, expected) - - def test_cython_grouper_series_bug_noncontig(self): - arr = np.empty((100, 100)) - arr.fill(np.nan) - obj = Series(arr[:, 0], index=lrange(100)) - inds = np.tile(lrange(10), 10) - - result = obj.groupby(inds).agg(Series.median) - assert result.isna().all() - - def test_series_grouper_noncontig_index(self): - index = Index(tm.rands_array(10, 100)) - - values = Series(np.random.randn(50), index=index[::2]) - labels = np.random.randint(0, 5, 50) - - # it works! - grouped = values.groupby(labels) - - # accessing the index elements causes segfault - f = lambda x: len(set(map(id, x.index))) - grouped.agg(f) - - def test_convert_objects_leave_decimal_alone(self): - - from decimal import Decimal - - s = Series(lrange(5)) - labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') - - def convert_fast(x): - return Decimal(str(x.mean())) - - def convert_force_pure(x): - # base will be length 0 - assert (len(x.base) > 0) - return Decimal(str(x.mean())) - - grouped = s.groupby(labels) - - result = grouped.agg(convert_fast) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - result = grouped.agg(convert_force_pure) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - def test_fast_apply(self): - # make sure that fast apply is correctly called - # rather than raising any kind of error - # otherwise the python path will be callsed - # which slows things down - N = 1000 - labels = np.random.randint(0, 2000, size=N) - labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) - - def f(g): - return 1 - - g = df.groupby(['key', 'key2']) - - grouper = g.grouper - - splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) - group_keys = grouper._get_group_keys() - - values, mutated = splitter.fast_apply(f, group_keys) - assert not mutated - - def test_apply_with_mixed_dtype(self): - # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 - df = DataFrame({'foo1': np.random.randn(6), - 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) - result = df.apply(lambda x: x, axis=1) - assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts()) - - # GH 3610 incorrect dtype conversion with as_index=False - df = 
DataFrame({"c1": [1, 2, 6, 6, 8]}) - df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - assert_series_equal(result1, result2) - - def test_groupby_aggregation_mixed_dtype(self): - - # GH 6212 - expected = DataFrame({ - 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], - 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, - index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), - ('big', 'damp'), - ('blue', 'dry'), - ('red', 'red'), ('red', 'wet')], - names=['by1', 'by2'])) - - df = DataFrame({ - 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, - 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, - np.nan, np.nan] - }) - - g = df.groupby(['by1', 'by2']) - result = g[['v1', 'v2']].mean() - assert_frame_equal(result, expected) - - def test_groupby_dtype_inference_empty(self): - # GH 6733 - df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) - assert df['x'].dtype == np.float64 - - result = df.groupby('x').first() - exp_index = Index([], name='x', dtype=np.float64) - expected = DataFrame({'range': Series( - [], index=exp_index, dtype='int64')}) - assert_frame_equal(result, expected, by_blocks=True) - - def test_groupby_list_infer_array_like(self): - result = self.df.groupby(list(self.df['A'])).mean() - expected = self.df.groupby(self.df['A']).mean() - assert_frame_equal(result, expected, check_names=False) - - pytest.raises(Exception, self.df.groupby, list(self.df['A'][:-1])) - - # pathological case of ambiguity - df = DataFrame({'foo': [0, 1], - 'bar': [3, 4], - 'val': np.random.randn(2)}) - - result = df.groupby(['foo', 'bar']).mean() - expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] - - def test_groupby_keys_same_size_as_index(self): - # GH 11185 - freq = 's' - index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), - periods=2, freq=freq) - df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ - 'metric', 'values' - ], index=index) - result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() - expected = df.set_index([df.index, 'metric']) - - assert_frame_equal(result, expected) - - def test_groupby_one_row(self): - # GH 11741 - df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) - pytest.raises(KeyError, df1.groupby, 'Z') - df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) - pytest.raises(KeyError, df2.groupby, 'Z') - - def test_groupby_nat_exclude(self): - # GH 6992 - df = pd.DataFrame( - {'values': np.random.randn(8), - 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( - '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, - pd.Timestamp('2013-01-01')], - 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) - grouped = df.groupby('dt') - - expected = [pd.Index([1, 7]), pd.Index([3, 5])] - keys = sorted(grouped.groups.keys()) - assert len(keys) == 2 - for k, e in zip(keys, expected): - # grouped.groups keys are np.datetime64 with system tz - # not to be affected by tz, only compare values - tm.assert_index_equal(grouped.groups[k], e) - - # confirm obj is not filtered - tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) - assert grouped.ngroups == 2 - - expected = { - Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), - Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) - } - - for k in grouped.indices: - 
tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) - - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + data = read_csv(StringIO(data), index_col=0) + + grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + +def test_groupby_series_with_name(df): + result = df.groupby(df['A']).mean() + result2 = df.groupby(df['A'], as_index=False).mean() + assert result.index.name == 'A' + assert 'A' in result2 + + result = df.groupby([df['A'], df['B']]).mean() + result2 = df.groupby([df['A'], df['B']], + as_index=False).mean() + assert result.index.names == ('A', 'B') + assert 'A' in result2 + assert 'B' in result2 + + +def test_seriesgroupby_name_attr(df): + # GH 6265 + result = df.groupby('A')['C'] + assert result.count().name == 'C' + assert result.mean().name == 'C' + + testFunc = lambda x: np.sum(x) * 2 + assert result.agg(testFunc).name == 'C' + + +def test_consistency_name(): + # GH 12363 + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + expected = df.groupby(['A']).B.count() + result = df.B.groupby(df.A).count() + assert_series_equal(result, expected) + + +def test_groupby_name_propagation(df): + # GH 6124 + def summarize(df, name=None): + return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = df.groupby('A').apply(summarize) + assert metrics.columns.name is None + metrics = df.groupby('A').apply(summarize, 'metrics') + assert metrics.columns.name == 'metrics' + metrics = df.groupby('A').apply(summarize_random_name) + assert metrics.columns.name is None + + +def test_groupby_nonstring_columns(): + df = DataFrame([np.arange(10) for x in range(10)]) + grouped = df.groupby(0) + result = grouped.mean() + expected = df.groupby(df[0]).mean() + assert_frame_equal(result, expected) + + +def test_groupby_mixed_type_columns(): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + + +def test_cython_grouper_series_bug_noncontig(): + arr = np.empty((100, 100)) + arr.fill(np.nan) + obj = Series(arr[:, 0], index=lrange(100)) + inds = np.tile(lrange(10), 10) + + result = obj.groupby(inds).agg(Series.median) + assert result.isna().all() + + +def test_series_grouper_noncontig_index(): + index = Index(tm.rands_array(10, 100)) + + values = Series(np.random.randn(50), index=index[::2]) + labels = np.random.randint(0, 5, 50) + + # it works! 
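+    # (the agg below maps over each group's index elements; the Series is
+    #  built on index[::2], a non-contiguous view, and that access path
+    #  historically segfaulted, so completing the call is the assertion)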
+ grouped = values.groupby(labels) + + # accessing the index elements causes segfault + f = lambda x: len(set(map(id, x.index))) + grouped.agg(f) + +def test_convert_objects_leave_decimal_alone(): + + s = Series(lrange(5)) + labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') + + def convert_fast(x): + return Decimal(str(x.mean())) + + def convert_force_pure(x): + # base will be length 0 + assert (len(x.base) > 0) + return Decimal(str(x.mean())) + + grouped = s.groupby(labels) + + result = grouped.agg(convert_fast) + assert result.dtype == np.object_ + assert isinstance(result[0], Decimal) + + result = grouped.agg(convert_force_pure) + assert result.dtype == np.object_ + assert isinstance(result[0], Decimal) + + +def test_groupby_dtype_inference_empty(): + # GH 6733 + df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) + assert df['x'].dtype == np.float64 + + result = df.groupby('x').first() + exp_index = Index([], name='x', dtype=np.float64) + expected = DataFrame({'range': Series( + [], index=exp_index, dtype='int64')}) + assert_frame_equal(result, expected, by_blocks=True) + + +def test_groupby_list_infer_array_like(df): + result = df.groupby(list(df['A'])).mean() + expected = df.groupby(df['A']).mean() + assert_frame_equal(result, expected, check_names=False) + + pytest.raises(Exception, df.groupby, list(df['A'][:-1])) + + # pathological case of ambiguity + df = DataFrame({'foo': [0, 1], + 'bar': [3, 4], + 'val': np.random.randn(2)}) + + result = df.groupby(['foo', 'bar']).mean() + expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + + +def test_groupby_keys_same_size_as_index(): + # GH 11185 + freq = 's' + index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), + periods=2, freq=freq) + df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ + 'metric', 'values' + ], index=index) + result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() + expected = df.set_index([df.index, 'metric']) + + assert_frame_equal(result, expected) + + +def test_groupby_one_row(): + # GH 11741 + df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) + pytest.raises(KeyError, df1.groupby, 'Z') + df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) + pytest.raises(KeyError, df2.groupby, 'Z') + + +def test_groupby_nat_exclude(): + # GH 6992 + df = pd.DataFrame( + {'values': np.random.randn(8), + 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( + '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, + pd.Timestamp('2013-01-01')], + 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) + grouped = df.groupby('dt') + + expected = [pd.Index([1, 7]), pd.Index([3, 5])] + keys = sorted(grouped.groups.keys()) + assert len(keys) == 2 + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + tm.assert_index_equal(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + assert grouped.ngroups == 2 + + expected = { + Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), + Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) + } + + for k in grouped.indices: + tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal( + grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) + tm.assert_frame_equal( + grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + + pytest.raises(KeyError, grouped.get_group, pd.NaT) + + 
nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': [pd.NaT, pd.NaT, pd.NaT]}) + assert nan_df['nan'].dtype == 'float64' + assert nan_df['nat'].dtype == 'datetime64[ns]' + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + assert grouped.groups == {} + assert grouped.ngroups == 0 + assert grouped.indices == {} + pytest.raises(KeyError, grouped.get_group, np.nan) pytest.raises(KeyError, grouped.get_group, pd.NaT) - nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], - 'nat': [pd.NaT, pd.NaT, pd.NaT]}) - assert nan_df['nan'].dtype == 'float64' - assert nan_df['nat'].dtype == 'datetime64[ns]' - - for key in ['nan', 'nat']: - grouped = nan_df.groupby(key) - assert grouped.groups == {} - assert grouped.ngroups == 0 - assert grouped.indices == {} - pytest.raises(KeyError, grouped.get_group, np.nan) - pytest.raises(KeyError, grouped.get_group, pd.NaT) - - def test_sparse_friendly(self): - sdf = self.df[['C', 'D']].to_sparse() - with catch_warnings(record=True): - panel = tm.makePanel() - tm.add_nans(panel) - - def _check_work(gp): - gp.mean() - gp.agg(np.mean) - dict(iter(gp)) - - # it works! - _check_work(sdf.groupby(lambda x: x // 2)) - _check_work(sdf['C'].groupby(lambda x: x // 2)) - _check_work(sdf.groupby(self.df['A'])) - - # do this someday - # _check_work(panel.groupby(lambda x: x.month, axis=1)) - - def test_panel_groupby(self): - with catch_warnings(record=True): - self.panel = tm.makePanel() - tm.add_nans(self.panel) - grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, - axis='items') - agged = grouped.mean() - agged2 = grouped.agg(lambda x: x.mean('items')) - - tm.assert_panel_equal(agged, agged2) - - tm.assert_index_equal(agged.items, Index([0, 1])) - - grouped = self.panel.groupby(lambda x: x.month, axis='major') - agged = grouped.mean() - - exp = Index(sorted(list(set(self.panel.major_axis.month)))) - tm.assert_index_equal(agged.major_axis, exp) - - grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis='minor') - agged = grouped.mean() - tm.assert_index_equal(agged.minor_axis, Index([0, 1])) - - def test_groupby_2d_malformed(self): - d = DataFrame(index=lrange(2)) - d['group'] = ['g1', 'g2'] - d['zeros'] = [0, 0] - d['ones'] = [1, 1] - d['label'] = ['l1', 'l2'] - tmp = d.groupby(['group']).mean() - res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) - tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) - tm.assert_numpy_array_equal(tmp.values, res_values) - - def test_int32_overflow(self): - B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) - )) - A = np.arange(25000) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': np.random.randn(25000)}) - - left = df.groupby(['A', 'B', 'C', 'D']).sum() - right = df.groupby(['D', 'C', 'B', 'A']).sum() - assert len(left) == len(right) - - def test_groupby_sort_multi(self): - df = DataFrame({'a': ['foo', 'bar', 'baz'], - 'b': [3, 2, 1], - 'c': [0, 1, 2], - 'd': np.random.randn(3)}) - - tups = lmap(tuple, df[['a', 'b', 'c']].values) - tups = com._asarray_tuplesafe(tups) - result = df.groupby(['a', 'b', 'c'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) - tups = lmap(tuple, df[['c', 'a', 'b']].values) - tups = com._asarray_tuplesafe(tups) - result = df.groupby(['c', 'a', 'b'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups) +def test_sparse_friendly(df): + sdf = df[['C', 'D']].to_sparse() + with catch_warnings(record=True): + panel = tm.makePanel() + tm.add_nans(panel) + + def 
_check_work(gp): + gp.mean() + gp.agg(np.mean) + dict(iter(gp)) + + # it works! + _check_work(sdf.groupby(lambda x: x // 2)) + _check_work(sdf['C'].groupby(lambda x: x // 2)) + _check_work(sdf.groupby(df['A'])) + + # do this someday + # _check_work(panel.groupby(lambda x: x.month, axis=1)) + + +def test_panel_groupby(): + with catch_warnings(record=True): + panel = tm.makePanel() + tm.add_nans(panel) + grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, + axis='items') + agged = grouped.mean() + agged2 = grouped.agg(lambda x: x.mean('items')) + + tm.assert_panel_equal(agged, agged2) + + tm.assert_index_equal(agged.items, Index([0, 1])) - tups = lmap(tuple, df[['b', 'c', 'a']].values) + grouped = panel.groupby(lambda x: x.month, axis='major') + agged = grouped.mean() + + exp = Index(sorted(list(set(panel.major_axis.month)))) + tm.assert_index_equal(agged.major_axis, exp) + + grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis='minor') + agged = grouped.mean() + tm.assert_index_equal(agged.minor_axis, Index([0, 1])) + + +def test_groupby_2d_malformed(): + d = DataFrame(index=lrange(2)) + d['group'] = ['g1', 'g2'] + d['zeros'] = [0, 0] + d['ones'] = [1, 1] + d['label'] = ['l1', 'l2'] + tmp = d.groupby(['group']).mean() + res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) + tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) + tm.assert_numpy_array_equal(tmp.values, res_values) + + +def test_int32_overflow(): + B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) + )) + A = np.arange(25000) + df = DataFrame({'A': A, + 'B': B, + 'C': A, + 'D': B, + 'E': np.random.randn(25000)}) + + left = df.groupby(['A', 'B', 'C', 'D']).sum() + right = df.groupby(['D', 'C', 'B', 'A']).sum() + assert len(left) == len(right) + + +def test_groupby_sort_multi(): + df = DataFrame({'a': ['foo', 'bar', 'baz'], + 'b': [3, 2, 1], + 'c': [0, 1, 2], + 'd': np.random.randn(3)}) + + tups = lmap(tuple, df[['a', 'b', 'c']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['a', 'b', 'c'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) + + tups = lmap(tuple, df[['c', 'a', 'b']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['c', 'a', 'b'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups) + + tups = lmap(tuple, df[['b', 'c', 'a']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['b', 'c', 'a'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) + + df = DataFrame({'a': [0, 1, 2, 0, 1, 2], + 'b': [0, 0, 0, 1, 1, 1], + 'd': np.random.randn(6)}) + grouped = df.groupby(['a', 'b'])['d'] + result = grouped.sum() + + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups) - result = df.groupby(['b', 'c', 'a'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) - - df = DataFrame({'a': [0, 1, 2, 0, 1, 2], - 'b': [0, 0, 0, 1, 1, 1], - 'd': np.random.randn(6)}) - grouped = df.groupby(['a', 'b'])['d'] - result = grouped.sum() - _check_groupby(df, result, ['a', 'b'], 'd') - - def test_intercept_builtin_sum(self): - s = Series([1., 2., np.nan, 3.]) - grouped = s.groupby([0, 1, 2, 2]) - - result = grouped.agg(builtins.sum) - result2 = grouped.apply(builtins.sum) - expected = grouped.sum() - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - - def test_rank_apply(self): - lev1 = 
tm.rands_array(10, 100) - lev2 = tm.rands_array(10, 130) - lab1 = np.random.randint(0, 100, size=500) - lab2 = np.random.randint(0, 130, size=500) - - df = DataFrame({'value': np.random.randn(500), - 'key1': lev1.take(lab1), - 'key2': lev2.take(lab2)}) - - result = df.groupby(['key1', 'key2']).value.rank() - - expected = [] - for key, piece in df.groupby(['key1', 'key2']): - expected.append(piece.value.rank()) - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - assert_series_equal(result, expected) - - result = df.groupby(['key1', 'key2']).value.rank(pct=True) - - expected = [] - for key, piece in df.groupby(['key1', 'key2']): - expected.append(piece.value.rank(pct=True)) - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - assert_series_equal(result, expected) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06')]]) - @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ - ('average', True, False, [2., 2., 5., 2., 4.]), - ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), - ('average', False, False, [4., 4., 1., 4., 2.]), - ('average', False, True, [.8, .8, .2, .8, .4]), - ('min', True, False, [1., 1., 5., 1., 4.]), - ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), - ('min', False, False, [3., 3., 1., 3., 2.]), - ('min', False, True, [.6, .6, .2, .6, .4]), - ('max', True, False, [3., 3., 5., 3., 4.]), - ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), - ('max', False, False, [5., 5., 1., 5., 2.]), - ('max', False, True, [1., 1., .2, 1., .4]), - ('first', True, False, [1., 2., 5., 3., 4.]), - ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), - ('first', False, False, [3., 4., 1., 5., 2.]), - ('first', False, True, [.6, .8, .2, 1., .4]), - ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), - ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [.6, .6, .2, .6, .4]), - ]) - def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, pct=pct) - - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], - ]) - @pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ - ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), - ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), - ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), - ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), - ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), - ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), - ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), - ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), - ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), - ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), - ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), - ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), - ('max', True, 'keep', [2., 2., np.nan, 3., 
np.nan, 5., 5.]), - ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), - ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), - ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), - ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), - ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), - ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), - ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), - ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), - ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), - ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), - ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), - ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), - ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), - ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), - ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), - ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), - ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) - ]) - def test_infs_n_nans(self, grps, vals, ties_method, ascending, na_option, - exp): - # GH 20561 - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option) - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06'), np.nan, np.nan] - ]) - @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ - ('average', True, 'keep', False, - [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), - ('average', True, 'keep', True, - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), - ('average', False, 'keep', False, - [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), - ('average', False, 'keep', True, - [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), - ('min', True, 'keep', False, - [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), - ('min', True, 'keep', True, - [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), - ('min', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('min', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('max', True, 'keep', False, - [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), - ('max', True, 'keep', True, - [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('max', False, 'keep', False, - [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), - ('max', False, 'keep', True, - [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('first', True, 'keep', False, - [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), - ('first', True, 'keep', True, - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('first', False, 'keep', False, - [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), - ('first', False, 'keep', True, - [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('dense', True, 'keep', False, - [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), - ('dense', True, 'keep', True, - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), - ('dense', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('dense', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('average', True, 'no_na', False, [2., 2., 
7., 5., 2., 4., 7., 7.]), - ('average', True, 'no_na', True, - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), - ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), - ('average', False, 'no_na', True, - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), - ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), - ('min', True, 'no_na', True, - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), - ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), - ('min', False, 'no_na', True, - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), - ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), - ('max', True, 'no_na', True, - [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), - ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), - ('max', False, 'no_na', True, - [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), - ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), - ('first', True, 'no_na', True, - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), - ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), - ('first', False, 'no_na', True, - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), - ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), - ('dense', True, 'no_na', True, - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), - ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), - ('dense', False, 'no_na', True, - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) - ]) - def test_rank_args_missing(self, grps, vals, ties_method, ascending, - na_option, pct, exp): - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) - - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("pct,exp", [ - (False, [3., 3., 3., 3., 3.]), - (True, [.6, .6, .6, .6, .6])]) - def test_rank_resets_each_group(self, pct, exp): - df = DataFrame( - {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], - 'val': [1] * 10} - ) - result = df.groupby('key').rank(pct=pct) - exp_df = DataFrame(exp * 2, columns=['val']) - assert_frame_equal(result, exp_df) - - def test_rank_avg_even_vals(self): - df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) - result = df.groupby('key').rank() - exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) - @pytest.mark.parametrize("ascending", [True, False]) - @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) - @pytest.mark.parametrize("pct", [True, False]) - @pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'] - ]) - def test_rank_object_raises(self, ties_method, ascending, na_option, - pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) - with tm.assert_raises_regex(TypeError, "not callable"): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) - - @pytest.mark.parametrize("agg_func", ['any', 'all']) - @pytest.mark.parametrize("skipna", [True, False]) - @pytest.mark.parametrize("vals", [ - ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], - [1, 2, 3], [1, 0, 0], [0, 0, 0], - [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], - [True, True, True], [True, False, False], [False, 
False, False], - [np.nan, np.nan, np.nan] - ]) - def test_groupby_bool_aggs(self, agg_func, skipna, vals): - df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) - - # Figure out expectation using Python builtin - exp = getattr(compat.builtins, agg_func)(vals) - - # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == 'any': - exp = False - - exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( - ['a', 'b'], name='key')) - result = getattr(df.groupby('key'), agg_func)(skipna=skipna) - assert_frame_equal(result, exp_df) - - def test_dont_clobber_name_column(self): - df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], - 'name': ['foo', 'bar', 'baz'] * 2}) - - result = df.groupby('key').apply(lambda x: x) - assert_frame_equal(result, df) - - def test_skip_group_keys(self): - from pandas import concat - - tsf = tm.makeTimeDataFrame() - - grouped = tsf.groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) - - pieces = [] - for key, group in grouped: - pieces.append(group.sort_values(by='A')[:3]) - - expected = concat(pieces) - assert_frame_equal(result, expected) - - grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values()[:3]) - - pieces = [] - for key, group in grouped: - pieces.append(group.sort_values()[:3]) - - expected = concat(pieces) - assert_series_equal(result, expected) - - def test_no_nonsense_name(self): - # GH #995 - s = self.frame['C'].copy() - s.name = None - - result = s.groupby(self.frame['A']).agg(np.sum) - assert result.name is None - - def test_multifunc_sum_bug(self): - # GH #1065 - x = DataFrame(np.arange(9).reshape(3, 3)) - x['test'] = 0 - x['fl'] = [1.3, 1.5, 1.6] - - grouped = x.groupby('test') - result = grouped.agg({'fl': 'sum', 2: 'size'}) - assert result['fl'].dtype == np.float64 - - def test_handle_dict_return_value(self): - def f(group): - return {'max': group.max(), 'min': group.min()} - - def g(group): - return Series({'max': group.max(), 'min': group.min()}) - - result = self.df.groupby('A')['C'].apply(f) - expected = self.df.groupby('A')['C'].apply(g) - - assert isinstance(result, Series) - assert_series_equal(result, expected) - - def test_set_group_name(self): - def f(group): - assert group.name is not None - return group - - def freduce(group): - assert group.name is not None - return group.sum() - - def foo(x): - return freduce(x) - - def _check_all(grouped): - # make sure all these work - grouped.apply(f) - grouped.aggregate(freduce) - grouped.aggregate({'C': freduce, 'D': freduce}) - grouped.transform(f) - - grouped['C'].apply(f) - grouped['C'].aggregate(freduce) - grouped['C'].aggregate([freduce, foo]) - grouped['C'].transform(f) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) - _check_all(self.df.groupby('A')) - _check_all(self.df.groupby(['A', 'B'])) - - def test_group_name_available_in_inference_pass(self): - # gh-15062 - df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) - - names = [] - - def f(group): - names.append(group.name) - return group.copy() - - df.groupby('a', sort=False, group_keys=False).apply(f) - # we expect 2 zeros because we call ``f`` once to see if a faster route - # can be used. 
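
The two zeros mentioned in the comment above come from an implementation detail worth seeing in isolation: under the pandas of this era, ``GroupBy.apply`` evaluates the function on the first group an extra time while probing for a fast path, so side effects fire twice for that group. A minimal standalone sketch, separate from the patch itself:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
    seen = []

    def record(group):
        # side effect: log which group the function was called on
        seen.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(record)
    # group 0 shows up twice: once for the fast-path probe, once for real
    assert seen == [0, 0, 1, 2]
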
- expected_names = [0, 0, 1, 2] - assert names == expected_names - - def test_no_dummy_key_names(self): - # see gh-1291 - result = self.df.groupby(self.df['A'].values).sum() - assert result.index.name is None - - result = self.df.groupby([self.df['A'].values, self.df['B'].values - ]).sum() - assert result.index.names == (None, None) - - def test_groupby_sort_multiindex_series(self): - # series multiindex groupby sort argument was not being passed through - # _compress_group_index - # GH 9444 - index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], - names=['a', 'b']) - mseries = Series([0, 1, 2, 3, 4, 5], index=index) - index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) - mseries_result = Series([0, 2, 4], index=index) - - result = mseries.groupby(level=['a', 'b'], sort=False).first() - assert_series_equal(result, mseries_result) - result = mseries.groupby(level=['a', 'b'], sort=True).first() - assert_series_equal(result, mseries_result.sort_index()) - - def test_groupby_reindex_inside_function(self): - - periods = 1000 - ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange( - periods), 'low': np.arange(periods)}, index=ind) - - def agg_before(hour, func, fix=False): - """ - Run an aggregate func on the subset of data. - """ - - def _func(data): - d = data.loc[data.index.map( - lambda x: x.hour < 11)].dropna() - if fix: - data[data.index[0]] - if len(d) == 0: - return None - return func(d) - - return _func - - def afunc(data): - d = data.select(lambda x: x.hour < 11).dropna() - return np.max(d) - - grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) - closure_bad = grouped.agg({'high': agg_before(11, np.max)}) - closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) - - assert_frame_equal(closure_bad, closure_good) - - def test_cython_median(self): - df = DataFrame(np.random.randn(1000)) - df.values[::2] = np.nan - - labels = np.random.randint(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - exp = df.groupby(labels).agg(nanops.nanmedian) - assert_frame_equal(result, exp) - - df = DataFrame(np.random.randn(1000, 5)) - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - assert_frame_equal(rs, xp) - - def test_median_empty_bins(self): - df = pd.DataFrame(np.random.randint(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins).median() - expected = df.groupby(bins).agg(lambda x: x.median()) - assert_frame_equal(result, expected) - - @pytest.mark.parametrize("dtype", [ - 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) - @pytest.mark.parametrize("method,data", [ - ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}), - ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}) - ]) - def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): - # GH9311, GH6620 - df = pd.DataFrame( - [{'a': 1, 'b': 1}, - {'a': 1, 'b': 2}, - {'a': 2, 'b': 3}, - {'a': 2, 'b': 4}]) - - df['b'] = df.b.astype(dtype) - - if 'args' not in data: - data['args'] = [] - - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype - - exp = 
data['df'] - df_out = pd.DataFrame(exp) - - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) - - grpd = df.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) - - def test_groupby_non_arithmetic_agg_intlike_precision(self): - # GH9311, GH6620 - c = 24650000000000000 - - inputs = ((Timestamp('2011-01-15 12:50:28.502376'), - Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c)) - - for i in inputs: - df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}]) - - grp_exp = {'first': {'expected': i[0]}, - 'last': {'expected': i[1]}, - 'min': {'expected': i[0]}, - 'max': {'expected': i[1]}, - 'nth': {'expected': i[1], - 'args': [1]}, - 'count': {'expected': 2}} - - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] - - grpd = df.groupby('a') - res = getattr(grpd, method)(*data['args']) - assert res.iloc[0].b == data['expected'] - - def test_groupby_multiindex_missing_pair(self): - # GH9049 - df = DataFrame({'group1': ['a', 'a', 'a', 'b'], - 'group2': ['c', 'c', 'd', 'c'], - 'value': [1, 1, 1, 5]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - res = df_grouped.agg('sum') - idx = MultiIndex.from_tuples( - [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) - exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) - - tm.assert_frame_equal(res, exp) - - def test_groupby_multiindex_not_lexsorted(self): - # GH 11640 - - # define the lexsorted version - lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) - assert lexsorted_df.columns.is_lexsorted() - - # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) - not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') - not_lexsorted_df = not_lexsorted_df.reset_index() - assert not not_lexsorted_df.columns.is_lexsorted() - - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.groupby('a').mean() - with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.groupby('a').mean() - tm.assert_frame_equal(expected, result) - - # a transforming function should work regardless of sort - # GH 14776 - df = DataFrame({'x': ['a', 'a', 'b', 'a'], - 'y': [1, 1, 2, 2], - 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) - assert not df.index.is_lexsorted() - - for level in [0, 1, [0, 1]]: - for sort in [False, True]: - result = df.groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) - expected = df - tm.assert_frame_equal(expected, result) - - result = df.sort_index().groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) - expected = df.sort_index() - tm.assert_frame_equal(expected, result) - - def test_gb_apply_list_of_unequal_len_arrays(self): - - # GH1738 - df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a', - 'b', 'b', 'b'], - 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd', - 'd', 'd', 'e'], - 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], - 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - def noddy(value, weight): - out = np.array(value * weight).repeat(3) - return out - - # the kernel function 
returns arrays of unequal length - # pandas sniffs the first one, sees it's an array and not - # a list, and assumed the rest are of equal length - # and so tries a vstack - - # don't die - df_grouped.apply(lambda x: noddy(x.value, x.weight)) - - def test_fill_constistency(self): - - # GH9221 - # pass thru keyword arguments to the generated wrapper - # are set if the passed kw is None (only) - df = DataFrame(index=pd.MultiIndex.from_product( - [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), - columns=Index( - ['1', '2'], name='id')) - df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, - np.nan, 22, np.nan] - df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, - np.nan, 44, np.nan] - - expected = df.groupby(level=0, axis=0).fillna(method='ffill') - result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T - assert_frame_equal(result, expected) - - def test_index_label_overlaps_location(self): - # checking we don't have any label/location confusion in the - # the wake of GH5375 - df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) - g = df.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = df.iloc[[1, 3, 4]] - assert_frame_equal(actual, expected) - - ser = df[0] - g = ser.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = ser.take([1, 3, 4]) - assert_series_equal(actual, expected) - - # ... and again, with a generic Index of floats - df.index = df.index.astype(float) - g = df.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = df.iloc[[1, 3, 4]] - assert_frame_equal(actual, expected) - - ser = df[0] - g = ser.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = ser.take([1, 3, 4]) - assert_series_equal(actual, expected) - - def test_groupby_cumprod(self): - # GH 4095 - df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) - - actual = df.groupby('key')['value'].cumprod() - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' - tm.assert_series_equal(actual, expected) - - df = pd.DataFrame({'key': ['b'] * 100, 'value': 2}) - actual = df.groupby('key')['value'].cumprod() - # if overflows, groupby product casts to float - # while numpy passes back invalid values - df['value'] = df['value'].astype(float) - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' - tm.assert_series_equal(actual, expected) - - def test_ops_general(self): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] - try: - from scipy.stats import sem - except ImportError: - pass - else: - ops.append(('sem', sem)) - df = DataFrame(np.random.randn(1000)) - labels = np.random.randint(0, 50, size=1000).astype(float) - - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise - - def test_max_nan_bug(self): - raw = """,Date,app,File -2013-04-23,2013-04-23 00:00:00,,log080001.log -2013-05-06,2013-05-06 00:00:00,,log.log -2013-05-07,2013-05-07 00:00:00,OE,xlsx""" - - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby('Date') - r = gb[['File']].max() - e = gb['File'].max().to_frame() 
- tm.assert_frame_equal(r, e) - assert not r['File'].isna().any() - - def test_nlargest(self): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series([ - 7, 5, 3, 10, 9, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series([ - 3, 2, 1, 3, 3, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) - assert_series_equal(gb.nlargest(3, keep='last'), e) - - def test_nsmallest(self): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series([ - 1, 2, 3, 0, 4, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series([ - 0, 1, 1, 0, 1, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) - assert_series_equal(gb.nsmallest(3, keep='last'), e) - - def test_transform_doesnt_clobber_ints(self): - # GH 7972 - n = 6 - x = np.arange(n) - df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) - df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) - - gb = df.groupby('a') - result = gb.transform('mean') - - gb2 = df2.groupby('a') - expected = gb2.transform('mean') - tm.assert_frame_equal(result, expected) - - def test_groupby_apply_all_none(self): - # Tests to make sure no errors if apply function returns all None - # values. Issue 9684. - test_df = DataFrame({'groups': [0, 0, 1, 1], - 'random_vars': [8, 7, 4, 5]}) - - def test_func(x): - pass - - result = test_df.groupby('groups').apply(test_func) - expected = DataFrame() - tm.assert_frame_equal(result, expected) - - def test_groupby_apply_none_first(self): - # GH 12824. Tests if apply returns None first. 
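
GH 12824 covers applied functions that return ``None`` for some groups, including the very first group pandas inspects; such groups should simply be dropped rather than corrupting the inferred result shape. Distilled into a standalone sketch, using the same data as the test below:

    import pandas as pd

    df = pd.DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})

    def first_and_last(x):
        if x.shape[0] < 2:
            return None           # single-row groups contribute nothing
        return x.iloc[[0, -1]]

    result = df.groupby('groups').apply(first_and_last)
    print(result)  # only group 1 survives; group 2's lone row is dropped
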
- test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) - test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) - - def test_func(x): - if x.shape[0] < 2: + _check_groupby(df, result, ['a', 'b'], 'd') + + +def test_dont_clobber_name_column(): + df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], + 'name': ['foo', 'bar', 'baz'] * 2}) + + result = df.groupby('key').apply(lambda x: x) + assert_frame_equal(result, df) + + +def test_skip_group_keys(): + + tsf = tm.makeTimeDataFrame() + + grouped = tsf.groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_values(by='A')[:3]) + + expected = pd.concat(pieces) + assert_frame_equal(result, expected) + + grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values()[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_values()[:3]) + + expected = pd.concat(pieces) + assert_series_equal(result, expected) + + +def test_no_nonsense_name(frame): + # GH #995 + s = frame['C'].copy() + s.name = None + + result = s.groupby(frame['A']).agg(np.sum) + assert result.name is None + + +def test_multifunc_sum_bug(): + # GH #1065 + x = DataFrame(np.arange(9).reshape(3, 3)) + x['test'] = 0 + x['fl'] = [1.3, 1.5, 1.6] + + grouped = x.groupby('test') + result = grouped.agg({'fl': 'sum', 2: 'size'}) + assert result['fl'].dtype == np.float64 + + +def test_handle_dict_return_value(df): + def f(group): + return {'max': group.max(), 'min': group.min()} + + def g(group): + return Series({'max': group.max(), 'min': group.min()}) + + result = df.groupby('A')['C'].apply(f) + expected = df.groupby('A')['C'].apply(g) + + assert isinstance(result, Series) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('grouper', ['A', ['A', 'B']]) +def test_set_group_name(df, grouper): + def f(group): + assert group.name is not None + return group + + def freduce(group): + assert group.name is not None + return group.sum() + + def foo(x): + return freduce(x) + + grouped = df.groupby(grouper) + + # make sure all these work + grouped.apply(f) + grouped.aggregate(freduce) + grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.transform(f) + + grouped['C'].apply(f) + grouped['C'].aggregate(freduce) + grouped['C'].aggregate([freduce, foo]) + grouped['C'].transform(f) + + +def test_group_name_available_in_inference_pass(): + # gh-15062 + df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + + names = [] + + def f(group): + names.append(group.name) + return group.copy() + + df.groupby('a', sort=False, group_keys=False).apply(f) + # we expect 2 zeros because we call ``f`` once to see if a faster route + # can be used. 
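
The rewritten tests above take ``df`` (and, further down, ``frame``, ``mframe``, ``tsframe``, ``three_group``, ``df_mixed_floats``) as pytest fixtures rather than ``MixIn`` attributes. A sketch of the kind of fixture this assumes; the frame contents mirror the old ``self.df``, but the real definition would live in a shared ``conftest.py`` that is not shown in this part of the patch:

    import numpy as np
    import pandas as pd
    import pytest

    @pytest.fixture
    def df():
        # the frame the old MixIn.setup_method stored as self.df
        return pd.DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8)})
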
+ expected_names = [0, 0, 1, 2] + assert names == expected_names + + +def test_no_dummy_key_names(df): + # see gh-1291 + result = df.groupby(df['A'].values).sum() + assert result.index.name is None + + result = df.groupby([df['A'].values, df['B'].values]).sum() + assert result.index.names == (None, None) + + +def test_groupby_sort_multiindex_series(): + # series multiindex groupby sort argument was not being passed through + # _compress_group_index + # GH 9444 + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=['a', 'b']) + mseries = Series([0, 1, 2, 3, 4, 5], index=index) + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) + mseries_result = Series([0, 2, 4], index=index) + + result = mseries.groupby(level=['a', 'b'], sort=False).first() + assert_series_equal(result, mseries_result) + result = mseries.groupby(level=['a', 'b'], sort=True).first() + assert_series_equal(result, mseries_result.sort_index()) + + +def test_groupby_reindex_inside_function(): + + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange( + periods), 'low': np.arange(periods)}, index=ind) + + def agg_before(hour, func, fix=False): + """ + Run an aggregate func on the subset of data. + """ + + def _func(data): + d = data.loc[data.index.map( + lambda x: x.hour < 11)].dropna() + if fix: + data[data.index[0]] + if len(d) == 0: return None - return x.iloc[[0, -1]] - - result1 = test_df1.groupby('groups').apply(test_func) - result2 = test_df2.groupby('groups').apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], - names=['groups', None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], - names=['groups', None]) - expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, - index=index1) - expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, - index=index2) - tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) - - def test_groupby_preserves_sort(self): - # Test to ensure that groupby always preserves sort order of original - # object. Issue #8588 and #9651 - - df = DataFrame( - {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], - 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], - 'ints': [8, 7, 4, 5, 2, 9, 1, 1], - 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], - 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) - - # Try sorting on different types and with different group types - for sort_column in ['ints', 'floats', 'strings', ['ints', 'floats'], - ['ints', 'strings']]: - for group_column in ['int_groups', 'string_groups', - ['int_groups', 'string_groups']]: - - df = df.sort_values(by=sort_column) - - g = df.groupby(group_column) - - def test_sort(x): - assert_frame_equal(x, x.sort_values(by=sort_column)) - - g.apply(test_sort) - - def test_numpy_compat(self): - # see gh-12811 - df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) - g = df.groupby('A') - - msg = "numpy operations are not valid with groupby" - - for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(g, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(g, func), foo=1) - - def test_group_shift_with_null_key(self): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. 
This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. - df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) - g = df.groupby(["A", "B"]) - - expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 - else np.nan) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) - result = g.shift(-1) - - assert_frame_equal(result, expected) - - def test_pivot_table_values_key_error(self): - # This test is designed to replicate the error in issue #14938 - df = pd.DataFrame({'eventDate': - pd.date_range(pd.datetime.today(), - periods=20, freq='M').tolist(), - 'thename': range(0, 20)}) - - df['year'] = df.set_index('eventDate').index.year - df['month'] = df.set_index('eventDate').index.month - - with pytest.raises(KeyError): - df.reset_index().pivot_table(index='year', columns='month', - values='badname', aggfunc='count') - - def test_cummin_cummax(self): - # GH 15048 - num_types = [np.int32, np.int64, np.float32, np.float64] - num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, - np.finfo(np.float32).min, np.finfo(np.float64).min] - num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, - np.finfo(np.float32).max, np.finfo(np.float64).max] - base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], - 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - for dtype, min_val, max_val in zip(num_types, num_mins, num_max): - df = base_df.astype(dtype) - - # cummin - expected = pd.DataFrame({'B': expected_mins}).astype(dtype) - result = df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test cummin w/ min value for dtype - df.loc[[2, 6], 'B'] = min_val - expected.loc[[2, 3, 6, 7], 'B'] = min_val - result = df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # cummax - expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) - result = df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test cummax w/ max value for dtype - df.loc[[2, 6], 'B'] = max_val - expected.loc[[2, 3, 6, 7], 'B'] = max_val - result = df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test nan in some values - base_df.loc[[0, 2, 4, 6], 'B'] = np.nan - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, - np.nan, 3, np.nan, 1]}) - result = base_df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummin()) - .to_frame()) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, - np.nan, 3, np.nan, 3]}) - result = base_df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummax()) - .to_frame()) - tm.assert_frame_equal(result, expected) - - # Test nan in entire column - base_df['B'] = np.nan - expected = pd.DataFrame({'B': [np.nan] * 8}) - result = 
base_df.groupby('A').cummin() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').cummax() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(expected, result) - - # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) - expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') - for method in ['cummax', 'cummin']: - result = getattr(df.groupby('a')['b'], method)() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) - result = df.groupby('a').b.cummax() - expected = pd.Series([2, 1, 2], name='b') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) - result = df.groupby('a').b.cummin() - expected = pd.Series([1, 2, 1], name='b') - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('in_vals, out_vals', [ - - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_increasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_increasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = ( - df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('in_vals, out_vals', [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_decreasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - - df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') - tm.assert_series_equal(result, expected) - - def test_apply_numeric_coercion_when_datetime(self): - # In the past, group-by/apply operations have been over-eager - # in converting dtypes to numeric, in the presence of datetime - # columns. Various GH issues were filed, the reproductions - # for which are here. 
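
The reproductions that follow share one shape: an object column holding numeric-looking strings (``'inf'``, ``'3'``) sitting next to a datetime column, where ``apply`` must leave the strings alone. A standalone distillation of the GH 15670 case, outside the diff:

    import pandas as pd

    df = pd.DataFrame({'Number': [1, 2],
                       'Date': ['2017-03-02'] * 2,
                       'Str': ['foo', 'inf']})
    expected = df.groupby('Number').apply(lambda x: x.iloc[0])

    df.Date = pd.to_datetime(df.Date)
    result = df.groupby('Number').apply(lambda x: x.iloc[0])

    # 'inf' must survive as the string 'inf' both times; the datetime
    # column must not drag neighbouring object columns into numeric types
    assert (result['Str'] == expected['Str']).all()
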
- - # GH 15670 - df = pd.DataFrame({'Number': [1, 2], - 'Date': ["2017-03-02"] * 2, - 'Str': ["foo", "inf"]}) - expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - df.Date = pd.to_datetime(df.Date) - result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - tm.assert_series_equal(result['Str'], expected['Str']) - - # GH 15421 - df = pd.DataFrame({'A': [10, 20, 30], - 'B': ['foo', '3', '4'], - 'T': [pd.Timestamp("12:31:22")] * 3}) - - def get_B(g): - return g.iloc[0][['B']] - result = df.groupby('A').apply(get_B)['B'] - expected = df.B - expected.index = df.A - tm.assert_series_equal(result, expected) - - # GH 14423 - def predictions(tool): - out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) - if 'step1' in list(tool.State): - out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) - if 'step2' in list(tool.State): - out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) - out['useTime'] = str( - tool[tool.State == 'step2'].oTime.values[0]) - return out - df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], - 'State': ['step1', 'step2', 'step1', 'step2'], - 'oTime': ['', '2016-09-19 05:24:33', - '', '2016-09-19 23:59:04'], - 'Machine': ['23', '36L', '36R', '36R']}) - df2 = df1.copy() - df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby('Key').apply(predictions).p1 - result = df2.groupby('Key').apply(predictions).p1 - tm.assert_series_equal(expected, result) - - def test_pipe(self): - # Test the pipe method of DataFrameGroupBy. - # Issue #17871 - - random_state = np.random.RandomState(1234567890) - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': random_state.randn(8), - 'C': random_state.randn(8)}) - - def f(dfgb): - return dfgb.B.max() - dfgb.C.min().min() - - def square(srs): - return srs ** 2 - - # Note that the transformations are - # GroupBy -> Series - # Series -> Series - # This then chains the GroupBy.pipe and the - # NDFrame.pipe methods - result = df.groupby('A').pipe(f).pipe(square) - - index = Index([u'bar', u'foo'], dtype='object', name=u'A') - expected = pd.Series([8.99110003361, 8.17516964785], name='B', - index=index) - - assert_series_equal(expected, result) - - def test_pipe_args(self): - # Test passing args to the pipe method of DataFrameGroupBy. 
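
``GroupBy.pipe`` is thin sugar: ``grouped.pipe(f, *args)`` is ``f(grouped, *args)``, which is what lets these pipe tests chain GroupBy -> Series -> Series. A two-line illustrative check, not part of the patch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': ['foo', 'bar'] * 4, 'B': np.arange(8.0)})

    def f(dfgb):
        return dfgb.B.max()

    # .pipe(f) is exactly f(<the GroupBy>), nothing more
    assert df.groupby('A').pipe(f).equals(f(df.groupby('A')))
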
- # Issue #17871 - - df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], - 'x': [1.0, 2.0, 3.0, 2.0, 5.0], - 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) - - def f(dfgb, arg1): - return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) - .groupby(dfgb.grouper)) - - def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 - - def h(df, arg3): - return df.x + df.y - arg3 - - result = (df - .groupby('group') - .pipe(f, 0) - .pipe(g, 10) - .pipe(h, 100)) - - # Assert the results here - index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, -80], - index=index) - - assert_series_equal(expected, result) - - # test SeriesGroupby.pipe - ser = pd.Series([1, 1, 2, 2, 3, 3]) - result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) - - expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) - - assert_series_equal(result, expected) - - def test_empty_dataframe_groupby(self): - # GH8093 - df = DataFrame(columns=['A', 'B', 'C']) - - result = df.groupby('A').sum() - expected = DataFrame(columns=['B', 'C'], dtype=np.float64) - expected.index.name = 'A' - - assert_frame_equal(result, expected) - - def test_tuple_warns(self): - # https://github.com/pandas-dev/pandas/issues/18314 - df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], - 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) - with tm.assert_produces_warning(FutureWarning) as w: - df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() - - assert "Interpreting tuple 'by' as a list" in str(w[0].message) + return func(d) + + return _func + + def afunc(data): + d = data.select(lambda x: x.hour < 11).dropna() + return np.max(d) + + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + closure_bad = grouped.agg({'high': agg_before(11, np.max)}) + closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + + assert_frame_equal(closure_bad, closure_good) + + +def test_groupby_multiindex_missing_pair(): + # GH9049 + df = DataFrame({'group1': ['a', 'a', 'a', 'b'], + 'group2': ['c', 'c', 'd', 'c'], + 'value': [1, 1, 1, 5]}) + df = df.set_index(['group1', 'group2']) + df_grouped = df.groupby(level=['group1', 'group2'], sort=True) + + res = df_grouped.agg('sum') + idx = MultiIndex.from_tuples( + [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) + exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) + + tm.assert_frame_equal(res, exp) + + +def test_groupby_multiindex_not_lexsorted(): + # GH 11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns.is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], + data=[[1, 'b1', 'c1', 3], + [1, 'b2', 'c2', 4]]) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index='a', columns=['b', 'c'], values='d') + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns.is_lexsorted() + + # compare the results + tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) + + expected = lexsorted_df.groupby('a').mean() + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.groupby('a').mean() + tm.assert_frame_equal(expected, result) + + # a transforming function should work regardless of sort + # GH 14776 + df = DataFrame({'x': ['a', 'a', 'b', 'a'], + 'y': [1, 1, 2, 2], + 'z': [1, 2, 3, 
4]}).set_index(['x', 'y']) + assert not df.index.is_lexsorted() + + for level in [0, 1, [0, 1]]: + for sort in [False, True]: + result = df.groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df + tm.assert_frame_equal(expected, result) + + result = df.sort_index().groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df.sort_index() + tm.assert_frame_equal(expected, result) + + +def test_index_label_overlaps_location(): + # checking we don't have any label/location confusion in the + # the wake of GH5375 + df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) + g = df.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + assert_series_equal(actual, expected) + + # ... and again, with a generic Index of floats + df.index = df.index.astype(float) + g = df.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + assert_series_equal(actual, expected) + + +def test_transform_doesnt_clobber_ints(): + # GH 7972 + n = 6 + x = np.arange(n) + df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) + df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) + + gb = df.groupby('a') + result = gb.transform('mean') + + gb2 = df2.groupby('a') + expected = gb2.transform('mean') + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings', + ['ints', 'floats'], + ['ints', 'strings']]) +@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups', + ['int_groups', 'string_groups']]) +def test_groupby_preserves_sort(sort_column, group_column): + # Test to ensure that groupby always preserves sort order of original + # object. Issue #8588 and #9651 + + df = DataFrame( + {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], + 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], + 'ints': [8, 7, 4, 5, 2, 9, 1, 1], + 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], + 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) + + # Try sorting on different types and with different group types + + df = df.sort_values(by=sort_column) + g = df.groupby(group_column) + + def test_sort(x): + assert_frame_equal(x, x.sort_values(by=sort_column)) + g.apply(test_sort) + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partially missing. 
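
In miniature, the segfault scenario is grouped ``shift`` where one grouping key is sometimes missing: such rows belong to no group (label ``-1``) and must shift to NaN instead of triggering an out-of-bounds read. An illustrative standalone sketch of the same shape as the test data below:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 2, 2],
                       'B': [1.0, np.nan, 1.0, 1.0],
                       'Z': [10.0, 20.0, 30.0, 40.0]})
    # row 1 has no (A, B) group; its shifted value is NaN, not garbage
    print(df.groupby(['A', 'B']).shift(-1))
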
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + g = df.groupby(["A", "B"]) + + expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 + else np.nan) + for i in range(n_rows)], dtype=float, + columns=["Z"], index=None) + result = g.shift(-1) + + assert_frame_equal(result, expected) + - with tm.assert_produces_warning(None): - df.groupby(('a', 'b')).c.mean() +def test_pivot_table_values_key_error(): + # This test is designed to replicate the error in issue #14938 + df = pd.DataFrame({'eventDate': + pd.date_range(pd.datetime.today(), + periods=20, freq='M').tolist(), + 'thename': range(0, 20)}) - def test_tuple_warns_unhashable(self): - # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) + df['year'] = df.set_index('eventDate').index.year + df['month'] = df.set_index('eventDate').index.month + + with pytest.raises(KeyError): + df.reset_index().pivot_table(index='year', columns='month', + values='badname', aggfunc='count') - with tm.assert_produces_warning(FutureWarning) as w: - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) - assert "Interpreting tuple 'by' as a list" in str(w[0].message) +def test_empty_dataframe_groupby(): + # GH8093 + df = DataFrame(columns=['A', 'B', 'C']) - def test_tuple_correct_keyerror(self): - # https://github.com/pandas-dev/pandas/issues/18798 - df = pd.DataFrame(1, index=range(3), - columns=pd.MultiIndex.from_product([[1, 2], - [3, 4]])) - with tm.assert_raises_regex(KeyError, "(7, 8)"): - df.groupby((7, 8)).mean() + result = df.groupby('A').sum() + expected = DataFrame(columns=['B', 'C'], dtype=np.float64) + expected.index.name = 'A' + assert_frame_equal(result, expected) -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) + +def test_tuple_warns(): + # https://github.com/pandas-dev/pandas/issues/18314 + df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], + 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + with tm.assert_produces_warning(FutureWarning) as w: + df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + with tm.assert_produces_warning(None): + df.groupby(('a', 'b')).c.mean() + + +def test_tuple_warns_unhashable(): + # https://github.com/pandas-dev/pandas/issues/18314 + business_dates = date_range(start='4/1/2014', end='6/30/2014', + freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning) as w: + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + +def test_tuple_correct_keyerror(): + # https://github.com/pandas-dev/pandas/issues/18798 + df = pd.DataFrame(1, index=range(3), + columns=pd.MultiIndex.from_product([[1, 2], + [3, 4]])) + with tm.assert_raises_regex(KeyError, "(7, 8)"): + df.groupby((7, 8)).mean() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 57becd342d370..743237f5b386c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,6 +9,7 @@ Index, 
MultiIndex, DataFrame, Series, CategoricalIndex) from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) +from pandas.core.groupby.groupby import Grouping from pandas.compat import lrange, long from pandas import compat @@ -16,13 +17,12 @@ import pandas.util.testing as tm import pandas as pd -from .common import MixIn # selection # -------------------------------- -class TestSelection(MixIn): +class TestSelection(): def test_select_bad_cols(self): df = DataFrame([[1, 2]], columns=['A', 'B']) @@ -48,14 +48,14 @@ def test_groupby_duplicated_column_errormsg(self): assert c.columns.nlevels == 1 assert c.columns.size == 3 - def test_column_select_via_attr(self): - result = self.df.groupby('A').C.sum() - expected = self.df.groupby('A')['C'].sum() + def test_column_select_via_attr(self, df): + result = df.groupby('A').C.sum() + expected = df.groupby('A')['C'].sum() assert_series_equal(result, expected) - self.df['mean'] = 1.5 - result = self.df.groupby('A').mean() - expected = self.df.groupby('A').agg(np.mean) + df['mean'] = 1.5 + result = df.groupby('A').mean() + expected = df.groupby('A').agg(np.mean) assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): @@ -96,7 +96,7 @@ def test_getitem_numeric_column_names(self): # grouping # -------------------------------- -class TestGrouping(MixIn): +class TestGrouping(): def test_grouper_index_types(self): # related GH5375 @@ -291,17 +291,17 @@ def test_grouper_getting_correct_binner(self): names=['one', 'two'])) assert_frame_equal(result, expected) - def test_grouper_iter(self): - assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] + def test_grouper_iter(self, df): + assert sorted(df.groupby('A').grouper) == ['bar', 'foo'] - def test_empty_groups(self): + def test_empty_groups(self, df): # see gh-1048 - pytest.raises(ValueError, self.df.groupby, []) + pytest.raises(ValueError, df.groupby, []) - def test_groupby_grouper(self): - grouped = self.df.groupby('A') + def test_groupby_grouper(self, df): + grouped = df.groupby('A') - result = self.df.groupby(grouped.grouper).mean() + result = df.groupby(grouped.grouper).mean() expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -339,10 +339,9 @@ def test_groupby_grouper_f_sanity_checked(self): pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) - def test_grouping_error_on_multidim_input(self): - from pandas.core.groupby.groupby import Grouping + def test_grouping_error_on_multidim_input(self, df): pytest.raises(ValueError, - Grouping, self.df.index, self.df[['A', 'A']]) + Grouping, df.index, df[['A', 'A']]) def test_multiindex_passthru(self): @@ -354,26 +353,25 @@ def test_multiindex_passthru(self): result = df.groupby(axis=1, level=[0, 1]).first() assert_frame_equal(result, df) - def test_multiindex_negative_level(self): + def test_multiindex_negative_level(self, mframe): # GH 13901 - result = self.mframe.groupby(level=-1).sum() - expected = self.mframe.groupby(level='second').sum() + result = mframe.groupby(level=-1).sum() + expected = mframe.groupby(level='second').sum() assert_frame_equal(result, expected) - result = self.mframe.groupby(level=-2).sum() - expected = self.mframe.groupby(level='first').sum() + result = mframe.groupby(level=-2).sum() + expected = mframe.groupby(level='first').sum() assert_frame_equal(result, expected) - result = self.mframe.groupby(level=[-2, -1]).sum() - expected = self.mframe + result = mframe.groupby(level=[-2, -1]).sum() + expected = mframe 
assert_frame_equal(result, expected) - result = self.mframe.groupby(level=[-1, 'first']).sum() - expected = self.mframe.groupby(level=['second', 'first']).sum() + result = mframe.groupby(level=[-1, 'first']).sum() + expected = mframe.groupby(level=['second', 'first']).sum() assert_frame_equal(result, expected) - def test_multifunc_select_col_integer_cols(self): - df = self.df + def test_multifunc_select_col_integer_cols(self, df): df.columns = np.arange(len(df.columns)) # it works! @@ -428,9 +426,9 @@ def test_groupby_multiindex_tuple(self): tm.assert_dict_equal(expected, result) @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level(self, sort): + def test_groupby_level(self, sort, mframe, df): # GH 17537 - frame = self.mframe + frame = mframe deleveled = frame.reset_index() result0 = frame.groupby(level=0, sort=sort).sum() @@ -464,7 +462,7 @@ def test_groupby_level(self, sort): assert_frame_equal(result1, expected1.T) # raise exception for non-MultiIndex - pytest.raises(ValueError, self.df.groupby, level=1) + pytest.raises(ValueError, df.groupby, level=1) def test_groupby_level_index_names(self): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) @@ -496,9 +494,9 @@ def test_groupby_level_with_nas(self, sort): expected = Series([6., 18.], index=[0.0, 1.0]) assert_series_equal(result, expected) - def test_groupby_args(self): + def test_groupby_args(self, mframe): # PR8618 and issue 8015 - frame = self.mframe + frame = mframe def j(): frame.groupby() @@ -516,14 +514,14 @@ def k(): [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] ]) - def test_level_preserve_order(self, sort, labels): + def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 - grouped = self.mframe.groupby(level=0, sort=sort) + grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) - def test_grouping_labels(self): - grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + def test_grouping_labels(self, mframe): + grouped = mframe.groupby(mframe.index.get_level_values(0)) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) @@ -531,7 +529,7 @@ def test_grouping_labels(self): # get_group # -------------------------------- -class TestGetGroup(MixIn): +class TestGetGroup(): def test_get_group(self): with catch_warnings(record=True): @@ -638,29 +636,28 @@ def test_gb_key_len_equal_axis_len(self): # groups & iteration # -------------------------------- -class TestIteration(MixIn): +class TestIteration(): - def test_groups(self): - grouped = self.df.groupby(['A']) + def test_groups(self, df): + grouped = df.groupby(['A']) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k).all() + assert (df.loc[v]['A'] == k).all() - grouped = self.df.groupby(['A', 'B']) + grouped = df.groupby(['A', 'B']) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k[0]).all() - assert (self.df.loc[v]['B'] == k[1]).all() + assert (df.loc[v]['A'] == k[0]).all() + assert (df.loc[v]['B'] == k[1]).all() - def test_grouping_is_iterable(self): + def test_grouping_is_iterable(self, tsframe): # this code path isn't used anywhere else # not sure it's useful - grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda 
x: x.year - ]) + grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) # test it works for g in grouped.grouper.groupings[0]: @@ -682,7 +679,7 @@ def test_multi_iter(self): assert e2 == two assert_series_equal(three, e3) - def test_multi_iter_frame(self): + def test_multi_iter_frame(self, three_group): k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) k2 = np.array(['1', '2', '1', '2', '1', '2']) df = DataFrame({'v1': np.random.randn(6), @@ -715,7 +712,7 @@ def test_multi_iter_frame(self): assert len(groups) == 2 # axis = 1 - three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + three_levels = three_group.groupby(['A', 'B', 'C']).mean() grouped = three_levels.T.groupby(axis=1, level=(1, 2)) for key, group in grouped: pass @@ -733,13 +730,13 @@ def test_multi_iter_panel(self): expected = wp.reindex(major=exp_axis) assert_panel_equal(group, expected) - def test_dictify(self): - dict(iter(self.df.groupby('A'))) - dict(iter(self.df.groupby(['A', 'B']))) - dict(iter(self.df['C'].groupby(self.df['A']))) - dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) - dict(iter(self.df.groupby('A')['C'])) - dict(iter(self.df.groupby(['A', 'B'])['C'])) + def test_dictify(self, df): + dict(iter(df.groupby('A'))) + dict(iter(df.groupby(['A', 'B']))) + dict(iter(df['C'].groupby(df['A']))) + dict(iter(df['C'].groupby([df['A'], df['B']]))) + dict(iter(df.groupby('A')['C'])) + dict(iter(df.groupby(['A', 'B'])['C'])) def test_groupby_with_small_elem(self): # GH 8542 diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index ccde545b5b8e9..a32ba9ad76f14 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -7,314 +7,316 @@ assert_produces_warning, assert_series_equal) -from .common import MixIn - - -class TestNth(MixIn): - - def test_first_last_nth(self): - # tests for first / last / nth - grouped = self.df.groupby('A') - first = grouped.first() - expected = self.df.loc[[1, 0], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - nth = grouped.nth(0) - assert_frame_equal(nth, expected) - - last = grouped.last() - expected = self.df.loc[[5, 7], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - assert_frame_equal(last, expected) - - nth = grouped.nth(-1) - assert_frame_equal(nth, expected) - - nth = grouped.nth(1) - expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy() - expected.index = Index(['foo', 'bar'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # it works! 
- grouped['B'].first() - grouped['B'].last() - grouped['B'].nth(0) - - self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - assert isna(grouped['B'].first()['foo']) - assert isna(grouped['B'].last()['foo']) - assert isna(grouped['B'].nth(0)['foo']) - - # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.first() - expected = df.iloc[[1, 2]].set_index('A') - assert_frame_equal(result, expected) - - expected = df.iloc[[1, 2]].set_index('A') - result = g.nth(0, dropna='any') - assert_frame_equal(result, expected) - - def test_first_last_nth_dtypes(self): - - df = self.df_mixed_floats.copy() - df['E'] = True - df['F'] = 1 - - # tests for first / last / nth - grouped = df.groupby('A') - first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(last, expected) - - nth = grouped.nth(1) - expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # GH 2763, first/last shifting dtypes - idx = lrange(10) - idx.append(9) - s = Series(data=lrange(11), index=idx, name='IntCol') - assert s.dtype == 'int64' - f = s.groupby(level=0).first() - assert f.dtype == 'int64' - - def test_nth(self): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) - assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) - assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) - assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) - assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), - df.loc[[0, 2], ['A', 'B']].set_index('A')) - - exp = df.set_index('A') - assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) - - exp['B'] = np.nan - assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) - - # out of bounds, regression from 0.13.1 - # GH 6621 - df = DataFrame({'color': {0: 'green', - 1: 'green', - 2: 'red', - 3: 'red', - 4: 'red'}, - 'food': {0: 'ham', - 1: 'eggs', - 2: 'eggs', - 3: 'ham', - 4: 'pork'}, - 'two': {0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997}, - 'one': {0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997}}).set_index(['color', - 'food']) - - result = df.groupby(level=0, as_index=False).nth(2) - expected = df.iloc[[-1]] - assert_frame_equal(result, expected) - - result = df.groupby(level=0, as_index=False).nth(3) - expected = df.loc[[]] - assert_frame_equal(result, expected) - - # GH 7559 - # from the vbench - df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') - s = df[1] - g = df[0] - expected = s.groupby(g).first() - expected2 = s.groupby(g).apply(lambda x: 
x.iloc[0]) - assert_series_equal(expected2, expected, check_names=False) - assert expected.name == 1 - assert expected2.name == 1 - - # validate first - v = s[g == 1].iloc[0] - assert expected.iloc[0] == v - assert expected2.iloc[0] == v - - # this is NOT the same as .first (as sorted is default!) - # as it keeps the order in the series (and not the group order) - # related GH 7287 - expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna='all') - assert_series_equal(result, expected) - - # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # PR 17493, related to issue 11038 - # test Series.nth with True for dropna produces FutureWarning - with assert_produces_warning(FutureWarning): - result = g.B.nth(0, dropna=True) - expected = g.B.first() - assert_series_equal(result, expected) - - # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], - columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) - - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) - # get the first, fourth and last two business days for each month - key = [df.index.year, df.index.month] - result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) - expected_dates = pd.to_datetime( - ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', - '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', - '2014/6/27', '2014/6/30']) - expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) - assert_frame_equal(result, expected) - - def test_nth_multi_index(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex, should match .first() - grouped = self.three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = grouped.first() - assert_frame_equal(result, expected) - - def test_nth_multi_index_as_expected(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex - three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny']}) - grouped = three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = DataFrame( - {'C': ['dull', 'dull', 'dull', 'dull']}, - index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], - ['one', 'two', 'one', 'two']], - names=['A', 'B'])) - assert_frame_equal(result, expected) - - def test_groupby_head_tail(self): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) - - # as_index= False, much easier - assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) - assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - - empty_not_as = 
DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_not_as, g_not_as.head(0)) - assert_frame_equal(empty_not_as, g_not_as.tail(0)) - assert_frame_equal(empty_not_as, g_not_as.head(-1)) - assert_frame_equal(empty_not_as, g_not_as.tail(-1)) - - assert_frame_equal(df, g_not_as.head(7)) # contains all - assert_frame_equal(df, g_not_as.tail(7)) - - # as_index=True, (used to be different) - df_as = df - - assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) - assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) - - empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_as, g_as.head(0)) - assert_frame_equal(empty_as, g_as.tail(0)) - assert_frame_equal(empty_as, g_as.head(-1)) - assert_frame_equal(empty_as, g_as.tail(-1)) - - assert_frame_equal(df_as, g_as.head(7)) # contains all - assert_frame_equal(df_as, g_as.tail(7)) - - # test with selection - assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - def test_group_selection_cache(self): - # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') - - g = df.groupby('A') - result1 = g.head(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.tail(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.head(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.tail(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) + +def test_first_last_nth(df): + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.loc[[1, 0], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + nth = grouped.nth(0) + assert_frame_equal(nth, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = df.loc[[2, 3], ['B', 'C', 'D']].copy() + expected.index = Index(['foo', 'bar'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # it works! 
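+    # smoke-test the selected-column path first; the NaN assignment
+    # below then checks that a group whose 'B' values are all NaN
+    # surfaces NaN from first/last/nth(0) instead of being dropped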
+ grouped['B'].first() + grouped['B'].last() + grouped['B'].nth(0) + + df.loc[df['A'] == 'foo', 'B'] = np.nan + assert isna(grouped['B'].first()['foo']) + assert isna(grouped['B'].last()['foo']) + assert isna(grouped['B'].nth(0)['foo']) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.first() + expected = df.iloc[[1, 2]].set_index('A') + assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]].set_index('A') + result = g.nth(0, dropna='any') + assert_frame_equal(result, expected) + + +def test_first_last_nth_dtypes(df_mixed_floats): + + df = df_mixed_floats.copy() + df['E'] = True + df['F'] = 1 + + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # GH 2763, first/last shifting dtypes + idx = lrange(10) + idx.append(9) + s = Series(data=lrange(11), index=idx, name='IntCol') + assert s.dtype == 'int64' + f = s.groupby(level=0).first() + assert f.dtype == 'int64' + + +def test_nth(): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) + assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) + assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), + df.loc[[0, 2], ['A', 'B']].set_index('A')) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame({'color': {0: 'green', + 1: 'green', + 2: 'red', + 3: 'red', + 4: 'red'}, + 'food': {0: 'ham', + 1: 'eggs', + 2: 'eggs', + 3: 'ham', + 4: 'pork'}, + 'two': {0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997}, + 'one': {0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997}}).set_index(['color', + 'food']) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + assert_frame_equal(result, expected) + + # GH 7559 + # from the vbench + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') + s = df[1] + g = df[0] + expected = s.groupby(g).first() + expected2 = s.groupby(g).apply(lambda x: 
x.iloc[0])
+    assert_series_equal(expected2, expected, check_names=False)
+    assert expected.name == 1
+    assert expected2.name == 1
+
+    # validate first
+    v = s[g == 1].iloc[0]
+    assert expected.iloc[0] == v
+    assert expected2.iloc[0] == v
+
+    # this is NOT the same as .first (as sorted is default!)
+    # as it keeps the order in the series (and not the group order)
+    # related GH 7287
+    expected = s.groupby(g, sort=False).first()
+    result = s.groupby(g, sort=False).nth(0, dropna='all')
+    assert_series_equal(result, expected)
+
+    # doc example
+    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+    g = df.groupby('A')
+    # PR 17493, related to issue 11038
+    # test Series.nth with True for dropna produces FutureWarning
+    with assert_produces_warning(FutureWarning):
+        result = g.B.nth(0, dropna=True)
+    expected = g.B.first()
+    assert_series_equal(result, expected)
+
+    # test multiple nth values
+    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
+                   columns=['A', 'B'])
+    g = df.groupby('A')
+
+    assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
+    assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
+    assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
+    assert_frame_equal(
+        g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
+    assert_frame_equal(
+        g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+    assert_frame_equal(
+        g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
+    assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
+    assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
+
+    business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
+                                   freq='B')
+    df = DataFrame(1, index=business_dates, columns=['a', 'b'])
+    # get the first, fourth and last two business days for each month
+    key = [df.index.year, df.index.month]
+    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
+    expected_dates = pd.to_datetime(
+        ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
+         '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
+         '2014/6/27', '2014/6/30'])
+    expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
+    assert_frame_equal(result, expected)
+
+
+def test_nth_multi_index(three_group):
+    # PR 9090, related to issue 8979
+    # test nth on MultiIndex, should match .first()
+    grouped = three_group.groupby(['A', 'B'])
+    result = grouped.nth(0)
+    expected = grouped.first()
+    assert_frame_equal(result, expected)
+
+
+def test_nth_multi_index_as_expected():
+    # PR 9090, related to issue 8979
+    # test nth on MultiIndex
+    three_group = DataFrame(
+        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
+               'foo', 'foo', 'foo'],
+         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
+               'two', 'two', 'one'],
+         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
+               'dull', 'shiny', 'shiny', 'shiny']})
+    grouped = three_group.groupby(['A', 'B'])
+    result = grouped.nth(0)
+    expected = DataFrame(
+        {'C': ['dull', 'dull', 'dull', 'dull']},
+        index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
+                                      ['one', 'two', 'one', 'two']],
+                                     names=['A', 'B']))
+    assert_frame_equal(result, expected)
+
+
+def test_groupby_head_tail():
+    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+    g_as = df.groupby('A', as_index=True)
+    g_not_as = df.groupby('A', as_index=False)
+
+    # as_index=False, much easier
+    assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+    assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
+
+    empty_not_as = 
DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) + empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + assert_frame_equal(empty_as, g_as.tail(-1)) + + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + +def test_group_selection_cache(): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) def test_nth_empty(): diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py new file mode 100644 index 0000000000000..6ad8b4905abff --- /dev/null +++ b/pandas/tests/groupby/test_rank.py @@ -0,0 +1,254 @@ +import pytest +import numpy as np +import pandas as pd +from pandas import DataFrame, concat +from pandas.util import testing as tm + + +def test_rank_apply(): + lev1 = tm.rands_array(10, 100) + lev2 = tm.rands_array(10, 130) + lab1 = np.random.randint(0, 100, size=500) + lab2 = np.random.randint(0, 130, size=500) + + df = DataFrame({'value': np.random.randn(500), + 'key1': lev1.take(lab1), + 'key2': lev2.take(lab2)}) + + result = df.groupby(['key1', 'key2']).value.rank() + + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + expected.append(piece.value.rank()) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + result = df.groupby(['key1', 'key2']).value.rank(pct=True) + + expected = [] + for key, piece in 
df.groupby(['key1', 'key2']): + expected.append(piece.value.rank(pct=True)) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [2, 2, 8, 2, 6], + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06')]]) +@pytest.mark.parametrize("ties_method,ascending,pct,exp", [ + ('average', True, False, [2., 2., 5., 2., 4.]), + ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ('average', False, False, [4., 4., 1., 4., 2.]), + ('average', False, True, [.8, .8, .2, .8, .4]), + ('min', True, False, [1., 1., 5., 1., 4.]), + ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ('min', False, False, [3., 3., 1., 3., 2.]), + ('min', False, True, [.6, .6, .2, .6, .4]), + ('max', True, False, [3., 3., 5., 3., 4.]), + ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ('max', False, False, [5., 5., 1., 5., 2.]), + ('max', False, True, [1., 1., .2, 1., .4]), + ('first', True, False, [1., 2., 5., 3., 4.]), + ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ('first', False, False, [3., 4., 1., 5., 2.]), + ('first', False, True, [.6, .8, .2, 1., .4]), + ('dense', True, False, [1., 1., 3., 1., 2.]), + ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', False, False, [3., 3., 1., 3., 2.]), + ('dense', False, True, [.6, .6, .2, .6, .4]), +]) +def test_rank_args(grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], +]) +@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ + ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), + ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), + ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), + ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), + ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), + ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), + ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), + ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), + ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), + ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), + ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), + ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), + ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), + ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), + ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), + ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), + ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), + ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), + ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), + ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), + ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), + ('first', False, 'bottom', [4., 5., 
6., 3., 7., 1., 2.]), + ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), + ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), + ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), + ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), + ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), + ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) +]) +def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option) + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan] +]) +@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ + ('average', True, 'keep', False, + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), + ('average', True, 'keep', True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), + ('average', False, 'keep', False, + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), + ('average', False, 'keep', True, + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), + ('min', True, 'keep', False, + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), + ('min', True, 'keep', True, + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ('min', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('min', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('max', True, 'keep', False, + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), + ('max', True, 'keep', True, + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('max', False, 'keep', False, + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), + ('max', False, 'keep', True, + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('first', True, 'keep', False, + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), + ('first', True, 'keep', True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('first', False, 'keep', False, + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), + ('first', False, 'keep', True, + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('dense', True, 'keep', False, + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), + ('dense', True, 'keep', True, + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + ('dense', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('dense', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), + ('average', True, 'no_na', True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), + ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), + ('average', False, 'no_na', True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), + ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), + ('min', True, 'no_na', True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), + ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), + ('min', False, 'no_na', True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), + ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), 
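+    # (in the 'no_na' cases the NaNs are ranked too, so with pct=True the
+    #  denominator is the full group length of 8, e.g. a rank of 7 maps
+    #  to 7. / 8 == 0.875)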
+ ('max', True, 'no_na', True, + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), + ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), + ('max', False, 'no_na', True, + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), + ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), + ('first', True, 'no_na', True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), + ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), + ('first', False, 'no_na', True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), + ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), + ('dense', True, 'no_na', True, + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), + ('dense', False, 'no_na', True, + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) +]) +def test_rank_args_missing(grps, vals, ties_method, ascending, + na_option, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("pct,exp", [ + (False, [3., 3., 3., 3., 3.]), + (True, [.6, .6, .6, .6, .6])]) +def test_rank_resets_each_group(pct, exp): + df = DataFrame( + {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], + 'val': [1] * 10} + ) + result = df.groupby('key').rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +def test_rank_avg_even_vals(): + df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) + result = df.groupby('key').rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("ties_method", [ + 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize("vals", [ + ['bar', 'bar', 'foo', 'bar', 'baz'], + ['bar', np.nan, 'foo', np.nan, 'baz'] +]) +def test_rank_object_raises(ties_method, ascending, na_option, + pct, vals): + df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + with tm.assert_raises_regex(TypeError, "not callable"): + df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 390b99d0fab1c..626057c1ea760 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -10,728 +10,758 @@ _ensure_platform_int, is_timedelta64_dtype) from pandas.compat import StringIO from pandas._libs import groupby -from .common import MixIn, assert_fp_equal from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby.groupby import DataError from pandas.core.config import option_context -class TestGroupBy(MixIn): - - def test_transform(self): - data = Series(np.arange(9) // 3, index=np.arange(9)) - - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) - - grouped = data.groupby(lambda x: x // 3) - - transformed = grouped.transform(lambda x: x * x.sum()) - assert transformed[7] == 12 - - # GH 8046 - # make sure that we preserve the input order - - df = DataFrame( - np.arange(6, 
dtype='int64').reshape( - 3, 2), columns=["a", "b"], index=[0, 2, 1]) - key = [0, 0, 1] - expected = df.sort_index().groupby(key).transform( - lambda x: x - x.mean()).groupby(key).mean() - result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( - key).mean() - assert_frame_equal(result, expected) - - def demean(arr): - return arr - arr.mean() - - people = DataFrame(np.random.randn(5, 5), - columns=['a', 'b', 'c', 'd', 'e'], - index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) - key = ['one', 'two', 'one', 'two', 'one'] - result = people.groupby(key).transform(demean).groupby(key).mean() - expected = people.groupby(key).apply(demean).groupby(key).mean() - assert_frame_equal(result, expected) - - # GH 8430 - df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq='M')) - g.transform(lambda x: x - 1) - - # GH 9700 - df = DataFrame({'a': range(5, 10), 'b': range(5)}) - result = df.groupby('a').transform(max) - expected = DataFrame({'b': range(5)}) - tm.assert_frame_equal(result, expected) - - def test_transform_fast(self): - - df = DataFrame({'id': np.arange(100000) / 3, - 'val': np.random.randn(100000)}) - - grp = df.groupby('id')['val'] - - values = np.repeat(grp.mean().values, - _ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name='val') - - result = grp.transform(np.mean) - assert_series_equal(result, expected) - - result = grp.transform('mean') - assert_series_equal(result, expected) - - # GH 12737 - df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], - 'd': pd.date_range('2014-1-1', '2014-1-4'), - 'i': [1, 2, 3, 4]}, - columns=['grouping', 'f', 'i', 'd']) - result = df.groupby('grouping').transform('first') - - dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] - expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], - 'd': dates, - 'i': [1, 2, 2, 4]}, - columns=['f', 'i', 'd']) - assert_frame_equal(result, expected) - - # selection - result = df.groupby('grouping')[['f', 'i']].transform('first') - expected = expected[['f', 'i']] - assert_frame_equal(result, expected) - - # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) - result = df.groupby('g').transform('first') - expected = df.drop('g', axis=1) - assert_frame_equal(result, expected) - - def test_transform_broadcast(self): - grouped = self.ts.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - - tm.assert_index_equal(result.index, self.ts.index) - for _, gp in grouped: - assert_fp_equal(result.reindex(gp.index), gp.mean()) - - grouped = self.tsframe.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - tm.assert_index_equal(result.index, self.tsframe.index) - for _, gp in grouped: - agged = gp.mean() - res = result.reindex(gp.index) - for col in self.tsframe: - assert_fp_equal(res[col], agged[col]) - - # group columns - grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis=1) - result = grouped.transform(np.mean) - tm.assert_index_equal(result.index, self.tsframe.index) - tm.assert_index_equal(result.columns, self.tsframe.columns) - for _, gp in grouped: - agged = gp.mean(1) - res = result.reindex(columns=gp.columns) - for idx in gp.index: - assert_fp_equal(res.xs(idx), agged[idx]) - - def test_transform_axis(self): - - # make sure that we are setting the axes - # correctly when on axis=0 or 1 - # in the presence of a non-monotonic indexer - # GH12713 - - base = self.tsframe.iloc[0:5] - r = len(base.index) - c = 
len(base.columns) - tso = DataFrame(np.random.randn(r, c), - index=base.index, - columns=base.columns, - dtype='float64') - # monotonic - ts = tso - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - # non-monotonic - ts = tso.iloc[[1, 0] + list(range(2, len(base)))] - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - def test_transform_dtype(self): - # GH 9807 - # Check transform dtype output is preserved - df = DataFrame([[1, 3], [2, 3]]) - result = df.groupby(1).transform('mean') - expected = DataFrame([[1.5], [1.5]]) - assert_frame_equal(result, expected) - - def test_transform_bug(self): - # GH 5712 - # transforming on a datetime column - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - result = df.groupby('A')['B'].transform( - lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name='B') - assert_series_equal(result, expected) - - def test_transform_numeric_to_boolean(self): - # GH 16875 - # inconsistency in transforming boolean values - expected = pd.Series([True, True], name='A') - - df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) - assert_series_equal(result, expected) - - df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) - assert_series_equal(result, expected) - - def test_transform_datetime_to_timedelta(self): - # GH 15429 - # transforming a datetime to timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = pd.Series([ - Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') - - # this does date math without changing result type in transform - base_time = df['A'][0] - result = df.groupby('A')['A'].transform( - lambda x: x.max() - x.min() + base_time) - base_time - assert_series_equal(result, expected) - - # this does date math and causes the transform to return timedelta - result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) - assert_series_equal(result, expected) - - def test_transform_datetime_to_numeric(self): - # GH 10972 - # convert dt to float - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) - - expected = Series([-0.5, 0.5], name='b') - assert_series_equal(result, expected) - - # convert dt to int - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) - - expected = Series([0, 1], name='b') - assert_series_equal(result, expected) - - def test_transform_casting(self): - # 13046 - data = """ - idx A ID3 DATETIME - 0 B-028 b76cd912ff "2014-10-08 13:43:27" - 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" - 2 B-076 1a682034f8 "2014-10-08 14:29:01" - 3 B-023 b76cd912ff "2014-10-08 
18:39:34" - 4 B-023 f88g8d7sds "2014-10-08 18:40:18" - 5 B-033 b76cd912ff "2014-10-08 18:44:30" - 6 B-032 b76cd912ff "2014-10-08 18:46:00" - 7 B-037 b76cd912ff "2014-10-08 18:52:15" - 8 B-046 db959faf02 "2014-10-08 18:59:59" - 9 B-053 b76cd912ff "2014-10-08 19:17:48" - 10 B-065 b76cd912ff "2014-10-08 19:21:38" - """ - df = pd.read_csv(StringIO(data), sep=r'\s+', - index_col=[0], parse_dates=['DATETIME']) - - result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) - assert is_timedelta64_dtype(result.dtype) - - result = df[['ID3', 'DATETIME']].groupby('ID3').transform( - lambda x: x.diff()) - assert is_timedelta64_dtype(result.DATETIME.dtype) - - def test_transform_multiple(self): - grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) - - grouped.transform(lambda x: x * 2) - grouped.transform(np.mean) - - def test_dispatch_transform(self): - df = self.tsframe[::5].reindex(self.tsframe.index) - - grouped = df.groupby(lambda x: x.month) - - filled = grouped.fillna(method='pad') - fillit = lambda x: x.fillna(method='pad') - expected = df.groupby(lambda x: x.month).transform(fillit) - assert_frame_equal(filled, expected) - - def test_transform_select_columns(self): - f = lambda x: x.mean() - result = self.df.groupby('A')['C', 'D'].transform(f) - - selection = self.df[['C', 'D']] - expected = selection.groupby(self.df['A']).transform(f) - - assert_frame_equal(result, expected) - - def test_transform_exclude_nuisance(self): - - # this also tests orderings in transform between - # series/frame to make sure it's consistent - expected = {} - grouped = self.df.groupby('A') - expected['C'] = grouped['C'].transform(np.mean) - expected['D'] = grouped['D'].transform(np.mean) - expected = DataFrame(expected) - result = self.df.groupby('A').transform(np.mean) - - assert_frame_equal(result, expected) - - def test_transform_function_aliases(self): - result = self.df.groupby('A').transform('mean') - expected = self.df.groupby('A').transform(np.mean) - assert_frame_equal(result, expected) - - result = self.df.groupby('A')['C'].transform('mean') - expected = self.df.groupby('A')['C'].transform(np.mean) - assert_series_equal(result, expected) - - def test_series_fast_transform_date(self): - # GH 13191 - df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], - 'd': pd.date_range('2014-1-1', '2014-1-4')}) - result = df.groupby('grouping')['d'].transform('first') - dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-4')] - expected = pd.Series(dates, name='d') - assert_series_equal(result, expected) - - def test_transform_length(self): - # GH 9697 - df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) - expected = pd.Series([3.0] * 4) - - def nsum(x): - return np.nansum(x) - - results = [df.groupby('col1').transform(sum)['col2'], - df.groupby('col1')['col2'].transform(sum), - df.groupby('col1').transform(nsum)['col2'], - df.groupby('col1')['col2'].transform(nsum)] - for result in results: - assert_series_equal(result, expected, check_names=False) - - def test_transform_coercion(self): - - # 14457 - # when we are transforming be sure to not coerce - # via assignment - df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) - g = df.groupby('A') - - expected = g.transform(np.mean) - result = g.transform(lambda x: np.mean(x)) - assert_frame_equal(result, expected) - - def test_groupby_transform_with_int(self): - - # GH 3740, make sure that we might upcast on item-by-item transform - - # floats - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, 
dtype='float64'), - C=Series( - [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=Series( - [-1, 0, 1, -1, 0, 1], dtype='float64'))) - assert_frame_equal(result, expected) - - # int case - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, - C=[1, 2, 3, 1, 2, 3], D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) - assert_frame_equal(result, expected) - - # int that needs float conversion - s = Series([2, 3, 4, 10, 5, -1]) - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - - s1 = s.iloc[0:3] - s1 = (s1 - s1.mean()) / s1.std() - s2 = s.iloc[3:6] - s2 = (s2 - s2.mean()) / s2.std() - expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) - assert_frame_equal(result, expected) - - # int downcasting - result = df.groupby('A').transform(lambda x: x * 2 / 2) - expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) - assert_frame_equal(result, expected) - - def test_groupby_transform_with_nan_group(self): - # GH 9941 - df = pd.DataFrame({'a': range(10), - 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)['a'].transform(max) - expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], - name='a') - assert_series_equal(result, expected) - - def test_transform_mixed_type(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) - - def f(group): - group['g'] = group['d'] * 2 - return group[:1] - - grouped = df.groupby('c') - result = grouped.apply(f) - - assert result['d'].dtype == np.float64 - - # this is by definition a mutating operation! 
- with option_context('mode.chained_assignment', None): - for key, group in grouped: - res = f(group) - assert_frame_equal(res, result.loc[key]) - - def test_cython_group_transform_algos(self): - # GH 4095 - dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, - np.uint64, np.float32, np.float64] - - ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]), - (groupby.group_cumsum, np.cumsum, dtypes)] - - is_datetimelike = False - for pd_op, np_op, dtypes in ops: - for dtype in dtypes: - data = np.array([[1], [2], [3], [4]], dtype=dtype) - ans = np.zeros_like(data) - labels = np.array([0, 0, 0, 0], dtype=np.int64) - pd_op(ans, data, labels, is_datetimelike) - tm.assert_numpy_array_equal(np_op(data), ans[:, 0], - check_dtype=False) - - # with nans - labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) - - data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') - actual = np.zeros_like(data) - actual.fill(np.nan) - groupby.group_cumprod_float64(actual, data, labels, is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') - tm.assert_numpy_array_equal(actual[:, 0], expected) - - actual = np.zeros_like(data) - actual.fill(np.nan) - groupby.group_cumsum(actual, data, labels, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') - tm.assert_numpy_array_equal(actual[:, 0], expected) - - # timedelta - is_datetimelike = True - data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] - actual = np.zeros_like(data, dtype='int64') - groupby.group_cumsum(actual, data.view('int64'), labels, - is_datetimelike) - expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( - 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), - np.timedelta64(5, 'ns')]) - tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) - - @pytest.mark.parametrize( - "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) - def test_cython_transform_series(self, op, args, targop): - # GH 4095 - s = Series(np.random.randn(1000)) - s_missing = s.copy() - s_missing.iloc[2:10] = np.nan - labels = np.random.randint(0, 50, size=1000).astype(float) - - # series - for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) - - tm.assert_series_equal( +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def test_transform(): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame( + np.arange(6, dtype='int64').reshape( + 3, 2), columns=["a", "b"], index=[0, 2, 1]) + key = [0, 0, 1] + expected = df.sort_index().groupby(key).transform( + lambda x: x - x.mean()).groupby(key).mean() + result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( + key).mean() + assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean() + + people = DataFrame(np.random.randn(5, 5), + columns=['a', 'b', 'c', 'd', 'e'], + index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) + key = ['one', 'two', 'one', 'two', 'one'] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = 
people.groupby(key).apply(demean).groupby(key).mean() + assert_frame_equal(result, expected) + + # GH 8430 + df = tm.makeTimeDataFrame() + g = df.groupby(pd.Grouper(freq='M')) + g.transform(lambda x: x - 1) + + # GH 9700 + df = DataFrame({'a': range(5, 10), 'b': range(5)}) + result = df.groupby('a').transform(max) + expected = DataFrame({'b': range(5)}) + tm.assert_frame_equal(result, expected) + + +def test_transform_fast(): + + df = DataFrame({'id': np.arange(100000) / 3, + 'val': np.random.randn(100000)}) + + grp = df.groupby('id')['val'] + + values = np.repeat(grp.mean().values, + _ensure_platform_int(grp.count().values)) + expected = pd.Series(values, index=df.index, name='val') + + result = grp.transform(np.mean) + assert_series_equal(result, expected) + + result = grp.transform('mean') + assert_series_equal(result, expected) + + # GH 12737 + df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], + 'd': pd.date_range('2014-1-1', '2014-1-4'), + 'i': [1, 2, 3, 4]}, + columns=['grouping', 'f', 'i', 'd']) + result = df.groupby('grouping').transform('first') + + dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] + expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], + 'd': dates, + 'i': [1, 2, 2, 4]}, + columns=['f', 'i', 'd']) + assert_frame_equal(result, expected) + + # selection + result = df.groupby('grouping')[['f', 'i']].transform('first') + expected = expected[['f', 'i']] + assert_frame_equal(result, expected) + + # dup columns + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) + result = df.groupby('g').transform('first') + expected = df.drop('g', axis=1) + assert_frame_equal(result, expected) + + +def test_transform_broadcast(tsframe, ts): + grouped = ts.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + + tm.assert_index_equal(result.index, ts.index) + for _, gp in grouped: + assert_fp_equal(result.reindex(gp.index), gp.mean()) + + grouped = tsframe.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + for _, gp in grouped: + agged = gp.mean() + res = result.reindex(gp.index) + for col in tsframe: + assert_fp_equal(res[col], agged[col]) + + # group columns + grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis=1) + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + tm.assert_index_equal(result.columns, tsframe.columns) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + assert_fp_equal(res.xs(idx), agged[idx]) + + +def test_transform_axis(tsframe): + + # make sure that we are setting the axes + # correctly when on axis=0 or 1 + # in the presence of a non-monotonic indexer + # GH12713 + + base = tsframe.iloc[0:5] + r = len(base.index) + c = len(base.columns) + tso = DataFrame(np.random.randn(r, c), + index=base.index, + columns=base.columns, + dtype='float64') + # monotonic + ts = tso + grouped = ts.groupby(lambda x: x.weekday()) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: x - x.mean()) + assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + assert_frame_equal(result, expected) + + # non-monotonic + ts = tso.iloc[[1, 0] + list(range(2, len(base)))] + grouped = ts.groupby(lambda x: x.weekday()) + result = 
ts - grouped.transform('mean') + expected = grouped.apply(lambda x: x - x.mean()) + assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform('mean') + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + assert_frame_equal(result, expected) + + +def test_transform_dtype(): + # GH 9807 + # Check transform dtype output is preserved + df = DataFrame([[1, 3], [2, 3]]) + result = df.groupby(1).transform('mean') + expected = DataFrame([[1.5], [1.5]]) + assert_frame_equal(result, expected) + + +def test_transform_bug(): + # GH 5712 + # transforming on a datetime column + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + result = df.groupby('A')['B'].transform( + lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name='B') + assert_series_equal(result, expected) + + +def test_transform_numeric_to_boolean(): + # GH 16875 + # inconsistency in transforming boolean values + expected = pd.Series([True, True], name='A') + + df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + + df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + + +def test_transform_datetime_to_timedelta(): + # GH 15429 + # transforming a datetime to timedelta + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + expected = pd.Series([ + Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') + + # this does date math without changing result type in transform + base_time = df['A'][0] + result = df.groupby('A')['A'].transform( + lambda x: x.max() - x.min() + base_time) - base_time + assert_series_equal(result, expected) + + # this does date math and causes the transform to return timedelta + result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) + assert_series_equal(result, expected) + + +def test_transform_datetime_to_numeric(): + # GH 10972 + # convert dt to float + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) + + expected = Series([-0.5, 0.5], name='b') + assert_series_equal(result, expected) + + # convert dt to int + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) + + expected = Series([0, 1], name='b') + assert_series_equal(result, expected) + + +def test_transform_casting(): + # 13046 + data = """ + idx A ID3 DATETIME + 0 B-028 b76cd912ff "2014-10-08 13:43:27" + 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" + 2 B-076 1a682034f8 "2014-10-08 14:29:01" + 3 B-023 b76cd912ff "2014-10-08 18:39:34" + 4 B-023 f88g8d7sds "2014-10-08 18:40:18" + 5 B-033 b76cd912ff "2014-10-08 18:44:30" + 6 B-032 b76cd912ff "2014-10-08 18:46:00" + 7 B-037 b76cd912ff "2014-10-08 18:52:15" + 8 B-046 db959faf02 "2014-10-08 18:59:59" + 9 B-053 b76cd912ff "2014-10-08 19:17:48" + 10 B-065 b76cd912ff "2014-10-08 19:21:38" + """ + df = pd.read_csv(StringIO(data), sep=r'\s+', + index_col=[0], parse_dates=['DATETIME']) + + result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) + assert is_timedelta64_dtype(result.dtype) + + result = df[['ID3', 'DATETIME']].groupby('ID3').transform( + lambda x: x.diff()) + assert is_timedelta64_dtype(result.DATETIME.dtype) + + +def 
test_transform_multiple(ts): + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + grouped.transform(lambda x: x * 2) + grouped.transform(np.mean) + + +def test_dispatch_transform(tsframe): + df = tsframe[::5].reindex(tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + filled = grouped.fillna(method='pad') + fillit = lambda x: x.fillna(method='pad') + expected = df.groupby(lambda x: x.month).transform(fillit) + assert_frame_equal(filled, expected) + + +def test_transform_select_columns(df): + f = lambda x: x.mean() + result = df.groupby('A')['C', 'D'].transform(f) + + selection = df[['C', 'D']] + expected = selection.groupby(df['A']).transform(f) + + assert_frame_equal(result, expected) + + +def test_transform_exclude_nuisance(df): + + # this also tests orderings in transform between + # series/frame to make sure it's consistent + expected = {} + grouped = df.groupby('A') + expected['C'] = grouped['C'].transform(np.mean) + expected['D'] = grouped['D'].transform(np.mean) + expected = DataFrame(expected) + result = df.groupby('A').transform(np.mean) + + assert_frame_equal(result, expected) + + +def test_transform_function_aliases(df): + result = df.groupby('A').transform('mean') + expected = df.groupby('A').transform(np.mean) + assert_frame_equal(result, expected) + + result = df.groupby('A')['C'].transform('mean') + expected = df.groupby('A')['C'].transform(np.mean) + assert_series_equal(result, expected) + + +def test_series_fast_transform_date(): + # GH 13191 + df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], + 'd': pd.date_range('2014-1-1', '2014-1-4')}) + result = df.groupby('grouping')['d'].transform('first') + dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-4')] + expected = pd.Series(dates, name='d') + assert_series_equal(result, expected) + + +def test_transform_length(): + # GH 9697 + df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) + expected = pd.Series([3.0] * 4) + + def nsum(x): + return np.nansum(x) + + results = [df.groupby('col1').transform(sum)['col2'], + df.groupby('col1')['col2'].transform(sum), + df.groupby('col1').transform(nsum)['col2'], + df.groupby('col1')['col2'].transform(nsum)] + for result in results: + assert_series_equal(result, expected, check_names=False) + + +def test_transform_coercion(): + + # 14457 + # when we are transforming be sure to not coerce + # via assignment + df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) + g = df.groupby('A') + + expected = g.transform(np.mean) + result = g.transform(lambda x: np.mean(x)) + assert_frame_equal(result, expected) + + +def test_groupby_transform_with_int(): + + # GH 3740, make sure that we might upcast on item-by-item transform + + # floats + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'), + C=Series( + [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + expected = DataFrame(dict(B=np.nan, C=Series( + [-1, 0, 1, -1, 0, 1], dtype='float64'))) + assert_frame_equal(result, expected) + + # int case + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, + C=[1, 2, 3, 1, 2, 3], D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) + assert_frame_equal(result, expected) + + # int that needs float conversion + s = Series([2, 3, 4, 10, 5, -1]) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], 
B=1, C=s, D='foo')) + with np.errstate(all='ignore'): + result = df.groupby('A').transform( + lambda x: (x - x.mean()) / x.std()) + + s1 = s.iloc[0:3] + s1 = (s1 - s1.mean()) / s1.std() + s2 = s.iloc[3:6] + s2 = (s2 - s2.mean()) / s2.std() + expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) + assert_frame_equal(result, expected) + + # int downcasting + result = df.groupby('A').transform(lambda x: x * 2 / 2) + expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) + assert_frame_equal(result, expected) + + +def test_groupby_transform_with_nan_group(): + # GH 9941 + df = pd.DataFrame({'a': range(10), + 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)['a'].transform(max) + expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], + name='a') + assert_series_equal(result, expected) + + +def test_transform_mixed_type(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] + ]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + assert result['d'].dtype == np.float64 + + # this is by definition a mutating operation! + with option_context('mode.chained_assignment', None): + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.loc[key]) + + +def test_cython_group_transform_algos(): + # GH 4095 + dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, + np.uint64, np.float32, np.float64] + + ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]), + (groupby.group_cumsum, np.cumsum, dtypes)] + + is_datetimelike = False + for pd_op, np_op, dtypes in ops: + for dtype in dtypes: + data = np.array([[1], [2], [3], [4]], dtype=dtype) + ans = np.zeros_like(data) + labels = np.array([0, 0, 0, 0], dtype=np.int64) + pd_op(ans, data, labels, is_datetimelike) + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], + check_dtype=False) + + # with nans + labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + + data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') + actual = np.zeros_like(data) + actual.fill(np.nan) + groupby.group_cumprod_float64(actual, data, labels, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') + tm.assert_numpy_array_equal(actual[:, 0], expected) + + actual = np.zeros_like(data) + actual.fill(np.nan) + groupby.group_cumsum(actual, data, labels, is_datetimelike) + expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') + tm.assert_numpy_array_equal(actual[:, 0], expected) + + # timedelta + is_datetimelike = True + data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] + actual = np.zeros_like(data, dtype='int64') + groupby.group_cumsum(actual, data.view('int64'), labels, + is_datetimelike) + expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( + 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), + np.timedelta64(5, 'ns')]) + tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) + + +@pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) +def test_cython_transform_series(op, args, targop): + # GH 4095 + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, 
size=1000).astype(float) + + # series + for data in [s, s_missing]: + # print(data.head()) + expected = data.groupby(labels).transform(targop) + + tm.assert_series_equal( + expected, + data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr( + data.groupby(labels), op)(*args)) + + +@pytest.mark.parametrize("op", ['cumprod', 'cumsum']) +@pytest.mark.parametrize("skipna", [False, True]) +@pytest.mark.parametrize('input, exp', [ + # When everything is NaN + ({'key': ['b'] * 10, 'value': np.nan}, + pd.Series([np.nan] * 10, name='value')), + # When there is a single NaN + ({'key': ['b'] * 10 + ['a'] * 2, + 'value': [3] * 3 + [np.nan] + [3] * 8}, + {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., + 2187., 6561., 19683., 3.0, 9.0], + ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., + 21., 24., 27., 3.0, 6.0]})]) +def test_groupby_cum_skipna(op, skipna, input, exp): + df = pd.DataFrame(input) + result = df.groupby('key')['value'].transform(op, skipna=skipna) + if isinstance(exp, dict): + expected = exp[(op, skipna)] + else: + expected = exp + expected = pd.Series(expected, name='value') + tm.assert_series_equal(expected, result) + + +@pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) +def test_cython_transform_frame(op, args, targop): + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, size=1000).astype(float) + strings = list('qwertyuiopasdfghjklz') + strings_missing = strings[:] + strings_missing[5] = np.nan + df = DataFrame({'float': s, + 'float_missing': s_missing, + 'int': [1, 1, 1, 1, 2] * 200, + 'datetime': pd.date_range('1990-1-1', periods=1000), + 'timedelta': pd.timedelta_range(1, freq='s', + periods=1000), + 'string': strings * 50, + 'string_missing': strings_missing * 50}, + columns=['float', 'float_missing', 'int', 'datetime', + 'timedelta', 'string', 'string_missing']) + df['cat'] = df['string'].astype('category') + + df2 = df.copy() + df2.index = pd.MultiIndex.from_product([range(100), range(10)]) + + # DataFrame - Single and MultiIndex, + # group by values, index level, columns + for df in [df, df2]: + for gb_target in [dict(by=labels), dict(level=0), dict(by='string') + ]: # dict(by='string_missing')]: + # dict(by=['int','string'])]: + + gb = df.groupby(**gb_target) + # whitelisted methods set the selection before applying + # bit a of hack to make sure the cythonized shift + # is equivalent to pre 0.17.1 behavior + if op == 'shift': + gb._set_group_selection() + + if op != 'shift' and 'int' not in gb_target: + # numeric apply fastpath promotes dtype so have + # to apply separately and concat + i = gb[['int']].apply(targop) + f = gb[['float', 'float_missing']].apply(targop) + expected = pd.concat([f, i], axis=1) + else: + expected = gb.apply(targop) + + expected = expected.sort_index(axis=1) + tm.assert_frame_equal(expected, + gb.transform(op, *args).sort_index( + axis=1)) + tm.assert_frame_equal( expected, - data.groupby(labels).transform(op, *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) - - @pytest.mark.parametrize("op", ['cumprod', 'cumsum']) - @pytest.mark.parametrize("skipna", [False, True]) - 
@pytest.mark.parametrize('input, exp', [ - # When everything is NaN - ({'key': ['b'] * 10, 'value': np.nan}, - pd.Series([np.nan] * 10, name='value')), - # When there is a single NaN - ({'key': ['b'] * 10 + ['a'] * 2, - 'value': [3] * 3 + [np.nan] + [3] * 8}, - {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], - ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., - 2187., 6561., 19683., 3.0, 9.0], - ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], - ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., - 21., 24., 27., 3.0, 6.0]})]) - def test_groupby_cum_skipna(self, op, skipna, input, exp): - df = pd.DataFrame(input) - result = df.groupby('key')['value'].transform(op, skipna=skipna) - if isinstance(exp, dict): - expected = exp[(op, skipna)] - else: - expected = exp - expected = pd.Series(expected, name='value') - tm.assert_series_equal(expected, result) - - @pytest.mark.parametrize( - "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) - def test_cython_transform_frame(self, op, args, targop): - s = Series(np.random.randn(1000)) - s_missing = s.copy() - s_missing.iloc[2:10] = np.nan - labels = np.random.randint(0, 50, size=1000).astype(float) - strings = list('qwertyuiopasdfghjklz') - strings_missing = strings[:] - strings_missing[5] = np.nan - df = DataFrame({'float': s, - 'float_missing': s_missing, - 'int': [1, 1, 1, 1, 2] * 200, - 'datetime': pd.date_range('1990-1-1', periods=1000), - 'timedelta': pd.timedelta_range(1, freq='s', - periods=1000), - 'string': strings * 50, - 'string_missing': strings_missing * 50}, - columns=['float', 'float_missing', 'int', 'datetime', - 'timedelta', 'string', 'string_missing']) - df['cat'] = df['string'].astype('category') - - df2 = df.copy() - df2.index = pd.MultiIndex.from_product([range(100), range(10)]) - - # DataFrame - Single and MultiIndex, - # group by values, index level, columns - for df in [df, df2]: - for gb_target in [dict(by=labels), dict(level=0), dict(by='string') - ]: # dict(by='string_missing')]: - # dict(by=['int','string'])]: - - gb = df.groupby(**gb_target) - # whitelisted methods set the selection before applying - # bit a of hack to make sure the cythonized shift - # is equivalent to pre 0.17.1 behavior - if op == 'shift': - gb._set_group_selection() - - if op != 'shift' and 'int' not in gb_target: - # numeric apply fastpath promotes dtype so have - # to apply separately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) - expected = pd.concat([f, i], axis=1) + getattr(gb, op)(*args).sort_index(axis=1)) + # individual columns + for c in df: + if c not in ['float', 'int', 'float_missing' + ] and op != 'shift': + pytest.raises(DataError, gb[c].transform, op) + pytest.raises(DataError, getattr(gb[c], op)) else: - expected = gb.apply(targop) - - expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal( - expected, - getattr(gb, op)(*args).sort_index(axis=1)) - # individual columns - for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': - pytest.raises(DataError, gb[c].transform, op) - pytest.raises(DataError, getattr(gb[c], op)) - else: - expected = gb[c].apply(targop) - expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - 
getattr(gb[c], op)(*args)) - - def test_transform_with_non_scalar_group(self): - # GH 10165 - cols = pd.MultiIndex.from_tuples([ - ('syn', 'A'), ('mis', 'A'), ('non', 'A'), - ('syn', 'C'), ('mis', 'C'), ('non', 'C'), - ('syn', 'T'), ('mis', 'T'), ('non', 'T'), - ('syn', 'G'), ('mis', 'G'), ('non', 'G')]) - df = pd.DataFrame(np.random.randint(1, 10, (4, 12)), - columns=cols, - index=['A', 'C', 'G', 'T']) - tm.assert_raises_regex(ValueError, 'transform must return ' - 'a scalar value for each ' - 'group.*', - df.groupby(axis=1, level=1).transform, - lambda z: z.div(z.sum(axis=1), axis=0)) - - @pytest.mark.parametrize('cols,exp,comp_func', [ - ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal), - (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}), - tm.assert_frame_equal) - ]) - @pytest.mark.parametrize('agg_func', [ - 'count', 'rank', 'size']) - def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func): - if agg_func == 'size' and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with " - "NDFrameGroupy") - - # GH 19200 - df = pd.DataFrame( - {'a': pd.date_range('2018-01-01', periods=3), - 'b': range(3), - 'c': range(7, 10)}) - - result = df.groupby('b')[cols].transform(agg_func) - - if agg_func == 'rank': - exp = exp.astype('float') - - comp_func(result, exp) - - @pytest.mark.parametrize("mix_groupings", [True, False]) - @pytest.mark.parametrize("as_series", [True, False]) - @pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) - @pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) - ]) - def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): - vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] - _exp_vals = list(exp_vals) - # Overwrite placeholder values - for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': - _exp_vals[index] = val1 - elif exp_val == 'val2': - _exp_vals[index] = val2 - - # Need to modify values and expectations depending on the - # Series / DataFrame that we ultimately want to generate - if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) - - def interweave(list_obj): - temp = list() - for x in list_obj: - temp.extend([x, x]) - - return temp - - _exp_vals = interweave(_exp_vals) - vals = interweave(vals) - else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) - _exp_vals = _exp_vals * 2 - vals = vals * 2 - - df = DataFrame({'key': keys, 'val': vals}) - if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') - assert_series_equal(result, exp) - else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': _exp_vals}) - assert_frame_equal(result, exp) - - @pytest.mark.parametrize("test_series", [True, False]) - @pytest.mark.parametrize("periods,fill_method,limit", [ - (1, 'ffill', None), (1, 'ffill', 1), - (1, 'bfill', None), (1, 'bfill', 1), - (-1, 'ffill', None), (-1, 'ffill', 1), - (-1, 'bfill', None), (-1, 'bfill', 1)]) - def test_pct_change(self, test_series, periods, fill_method, limit): - vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] - exp_vals = Series(vals).pct_change(periods=periods, - fill_method=fill_method, - limit=limit).tolist() - - df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), - 'vals': vals * 2}) - grp = df.groupby('key') - - def get_result(grp_obj): - return grp_obj.pct_change(periods=periods, - fill_method=fill_method, - limit=limit) - - if test_series: - exp = pd.Series(exp_vals * 2) - exp.name = 'vals' - grp = grp['vals'] - result = get_result(grp) - tm.assert_series_equal(result, exp) - else: - exp = DataFrame({'vals': exp_vals * 2}) - result = get_result(grp) - tm.assert_frame_equal(result, exp) - - @pytest.mark.parametrize("func", [np.any, np.all]) - def test_any_all_np_func(self, func): - # GH 20653 - df = pd.DataFrame([['foo', True], - [np.nan, True], - ['foo', True]], columns=['key', 'val']) - - exp = pd.Series([True, np.nan, True], name='val') - - res = df.groupby('key')['val'].transform(func) - tm.assert_series_equal(res, exp) + expected = gb[c].apply(targop) + expected.name = c + tm.assert_series_equal(expected, + gb[c].transform(op, *args)) + tm.assert_series_equal(expected, + getattr(gb[c], op)(*args)) + + +def test_transform_with_non_scalar_group(): + # GH 10165 + cols = pd.MultiIndex.from_tuples([ + ('syn', 'A'), ('mis', 'A'), ('non', 'A'), + ('syn', 'C'), ('mis', 'C'), ('non', 'C'), + ('syn', 'T'), ('mis', 'T'), ('non', 'T'), + ('syn', 'G'), ('mis', 'G'), ('non', 'G')]) + df = pd.DataFrame(np.random.randint(1, 10, (4, 12)), + columns=cols, + index=['A', 'C', 'G', 'T']) + tm.assert_raises_regex(ValueError, 'transform must return ' + 'a scalar value for each ' + 'group.*', + df.groupby(axis=1, level=1).transform, + lambda z: z.div(z.sum(axis=1), axis=0)) + + +@pytest.mark.parametrize('cols,exp,comp_func', [ + ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal), + (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}), + tm.assert_frame_equal) +]) +@pytest.mark.parametrize('agg_func', [ + 'count', 'rank', 'size']) +def test_transform_numeric_ret(cols, exp, comp_func, agg_func): + if agg_func == 'size' and isinstance(cols, list): + pytest.xfail("'size' transformation not supported with " + "NDFrameGroupy") + + # GH 19200 + df = pd.DataFrame( + {'a': pd.date_range('2018-01-01', periods=3), + 'b': range(3), + 'c': range(7, 10)}) + + result = df.groupby('b')[cols].transform(agg_func) + + if agg_func == 'rank': + exp = exp.astype('float') + + comp_func(result, exp) + + +@pytest.mark.parametrize("mix_groupings", [True, False]) +@pytest.mark.parametrize("as_series", [True, False]) +@pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) 
+@pytest.mark.parametrize("fill_method,limit,exp_vals", [
+    ("ffill", None,
+     [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
+    ("ffill", 1,
+     [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
+    ("bfill", None,
+     ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
+    ("bfill", 1,
+     [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
+])
+def test_group_fill_methods(mix_groupings, as_series, val1, val2,
+                            fill_method, limit, exp_vals):
+    vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
+    _exp_vals = list(exp_vals)
+    # Overwrite placeholder values
+    for index, exp_val in enumerate(_exp_vals):
+        if exp_val == 'val1':
+            _exp_vals[index] = val1
+        elif exp_val == 'val2':
+            _exp_vals[index] = val2
+
+    # Need to modify values and expectations depending on the
+    # Series / DataFrame that we ultimately want to generate
+    if mix_groupings:  # ['a', 'b', 'a', 'b', ...]
+        keys = ['a', 'b'] * len(vals)
+
+        def interweave(list_obj):
+            temp = list()
+            for x in list_obj:
+                temp.extend([x, x])
+
+            return temp
+
+        _exp_vals = interweave(_exp_vals)
+        vals = interweave(vals)
+    else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
+        keys = ['a'] * len(vals) + ['b'] * len(vals)
+        _exp_vals = _exp_vals * 2
+        vals = vals * 2
+
+    df = DataFrame({'key': keys, 'val': vals})
+    if as_series:
+        result = getattr(
+            df.groupby('key')['val'], fill_method)(limit=limit)
+        exp = Series(_exp_vals, name='val')
+        assert_series_equal(result, exp)
+    else:
+        result = getattr(df.groupby('key'), fill_method)(limit=limit)
+        exp = DataFrame({'key': keys, 'val': _exp_vals})
+        assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("periods,fill_method,limit", [
+    (1, 'ffill', None), (1, 'ffill', 1),
+    (1, 'bfill', None), (1, 'bfill', 1),
+    (-1, 'ffill', None), (-1, 'ffill', 1),
+    (-1, 'bfill', None), (-1, 'bfill', 1)])
+def test_pct_change(test_series, periods, fill_method, limit):
+    vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
+    exp_vals = Series(vals).pct_change(periods=periods,
+                                       fill_method=fill_method,
+                                       limit=limit).tolist()
+
+    df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
+                    'vals': vals * 2})
+    grp = df.groupby('key')
+
+    def get_result(grp_obj):
+        return grp_obj.pct_change(periods=periods,
+                                  fill_method=fill_method,
+                                  limit=limit)
+
+    if test_series:
+        exp = pd.Series(exp_vals * 2)
+        exp.name = 'vals'
+        grp = grp['vals']
+        result = get_result(grp)
+        tm.assert_series_equal(result, exp)
+    else:
+        exp = DataFrame({'vals': exp_vals * 2})
+        result = get_result(grp)
+        tm.assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize("func", [np.any, np.all])
+def test_any_all_np_func(func):
+    # GH 20653
+    df = pd.DataFrame([['foo', True],
+                       [np.nan, True],
+                       ['foo', True]], columns=['key', 'val'])
+
+    exp = pd.Series([True, np.nan, True], name='val')
+
+    res = df.groupby('key')['val'].transform(func)
+    tm.assert_series_equal(res, exp)

From 466f90a2b44314b80121d1662a59ef0282547b89 Mon Sep 17 00:00:00 2001
From: Anh Le
Date: Mon, 16 Apr 2018 01:14:23 -0400
Subject: [PATCH 28/33] ENH GH20601 raise an error when the number of levels
 in a pivot table is larger than int32

---
 pandas/core/reshape/reshape.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 389f1af48434a..13cb0cc08ca74 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -162,6 +162,8 @@ def _make_selectors(self):
self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift + if np.prod(self.full_shape) > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) From dc982ded1a3cdc1757d48f05bae841de11924cf0 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:53:06 -0400 Subject: [PATCH 29/33] TST add a test for pivot table large number of levels causing int32 overflow --- pandas/tests/reshape/test_pivot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1004b40bfb4c1..e6754bcba9283 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1237,6 +1237,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): aggfunc=f_numpy) tm.assert_frame_equal(result, expected) + @pytest.mark.slow + def test_pivot_number_of_levels_larger_than_int32(self): + # GH 20601 + data = DataFrame({'ind1': list(range(1337600)) * 2, + 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + class TestCrosstab(object): From ea53febb01efb537851c8432947c41d8ccc44067 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:55:56 -0400 Subject: [PATCH 30/33] CLN PEP8 compliance --- pandas/tests/reshape/test_pivot.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e6754bcba9283..cb8f606dd0f99 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1241,9 +1241,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + 'ind2': list(range(3040)) * 2 * 440, + 'count': [1] * 2 * 1337600}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + data.pivot_table(index='ind1', columns='ind2', + values='count', aggfunc='count') class TestCrosstab(object): From 50d5e02866a3c5496dbf19720dfa19a2a53f086b Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 02:07:37 -0400 Subject: [PATCH 31/33] DOC add whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e19aedac80213..9b20f0b4756b4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1221,6 +1221,7 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) +- Improved error message when the number of levels in a pivot table is too large causing int32 overflow (:issue:`20601`) Other ^^^^^ From 90b76243aad7dd8e4683a89ade88c5b7d146a141 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:20:18 -0400 Subject: [PATCH 32/33] ENH catch the int32 overflow error earlier and in two separate places: 
in pivot_table and unstack --- pandas/core/reshape/pivot.py | 5 +++++ pandas/core/reshape/reshape.py | 7 +++++-- pandas/tests/reshape/test_pivot.py | 8 ++++---- pandas/tests/test_multilevel.py | 7 +++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 74a9b59d3194a..6c3401bdc58e3 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -29,6 +29,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', index = _convert_by(index) columns = _convert_by(columns) + num_rows = data.reindex(index, axis='columns').shape[0] + num_columns = data.reindex(columns, axis='columns').shape[0] + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') + if isinstance(aggfunc, list): pieces = [] keys = [] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 13cb0cc08ca74..fe9fc9b674f50 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -127,6 +127,11 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_columns = self.removed_level.size + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Unstacked data frame is too big, causing int32 overflow') + self._make_sorted_values_labels() self._make_selectors() @@ -162,8 +167,6 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift - if np.prod(self.full_shape) > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index cb8f606dd0f99..6d0a7358c2c39 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1240,11 +1240,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, - 'count': [1] * 2 * 1337600}) + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': np.arange(2 ** 16)}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', + df.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 79e05c90a21b0..24ad31de494d8 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1192,6 +1192,13 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + @pytest.mark.slow + def test_unstack_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + df.unstack() + def test_stack_order_with_unsorted_levels(self): # GH 16323 From 2416db18c044575bff2f9ee3fd1a306b86059d29 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:33:46 -0400 Subject: [PATCH 33/33] CLN edit whatsnew entry and remove old code --- 
 doc/source/whatsnew/v0.23.0.txt | 2 +-
 pandas/core/reshape/reshape.py  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 9b20f0b4756b4..c5877f10420dc 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1221,7 +1221,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
-- Improved error message when the number of levels in a pivot table is too large causing int32 overflow (:issue:`20601`)
+- Improved error message when the number of levels in a pivot table or an unstacked DataFrame is too large, causing int32 overflow (:issue:`20601`)

 Other
 ^^^^^

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index ed60209c30737..fe9fc9b674f50 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -167,8 +167,6 @@ def _make_selectors(self):

         self.full_shape = ngroups, stride
         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
-        if np.prod(self.full_shape) > (2 ** 31 - 1):
-            raise ValueError('Pivot table is too big, causing int32 overflow')
         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
         mask.put(selector, True)
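
Editor's note (supplementary, not part of the patch series): the checks added
in this series compare the prospective cell count against ``2 ** 31 - 1``
because ``np.prod`` may accumulate the shape product in a 32-bit integer
(notably on Windows builds, where the default NumPy integer is 32 bits), so
the product can silently wrap around before the boolean ``mask`` array is
allocated. Below is a minimal sketch of the failure mode and of the kind of
up-front guard PATCH 32 installs; the shape value is illustrative, and the
32-bit accumulation is forced explicitly so the wrap-around reproduces on any
platform:

    import numpy as np

    # A hypothetical unstacked result: 2**16 rows by 2**16 column levels.
    full_shape = (2 ** 16, 2 ** 16)

    # Forcing 32-bit accumulation reproduces the overflow being guarded
    # against: the true product, 2**32, wraps to 0 in a signed 32-bit int.
    wrapped = np.prod(full_shape, dtype=np.int32)
    assert wrapped == 0  # silently wrong; no exception, no warning

    # Checking the product with Python's arbitrary-precision integers
    # *before* allocating turns the silent wrap-around into a clear error,
    # using the same message the patches raise.
    num_cells = full_shape[0] * full_shape[1]
    try:
        if num_cells > 2 ** 31 - 1:
            raise ValueError('Unstacked data frame is too big, '
                             'causing int32 overflow')
    except ValueError as err:
        print(err)

This is also why the guard moved out of ``_make_selectors`` in the final two
patches: validating the cell count in ``pivot_table`` and in the unstacker's
constructor catches the condition before any ``np.prod``-sized allocation is
attempted, rather than after the shape has already been computed.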