diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py new file mode 100644 index 0000000000000..bc13d51c4f4f6 --- /dev/null +++ b/pandas/tests/groupby/test_functional.py @@ -0,0 +1,371 @@ +# -*- coding: utf-8 -*- + +""" test function application """ + +import pytest + +from string import ascii_lowercase +from pandas import (date_range, Timestamp, + Index, MultiIndex, DataFrame, Series) +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.compat import product as cart_product + +import numpy as np + +import pandas.util.testing as tm +import pandas as pd +from .common import MixIn + + +# describe +# -------------------------------- + +class TestDescribe(MixIn): + + def test_apply_describe_bug(self): + grouped = self.mframe.groupby(level='first') + grouped.describe() # it works! + + def test_series_describe_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + assert_series_equal(result['mean'], grouped.mean(), check_names=False) + assert_series_equal(result['std'], grouped.std(), check_names=False) + assert_series_equal(result['min'], grouped.min(), check_names=False) + + def test_series_describe_single(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack() + assert_series_equal(result, expected) + + def test_series_index_name(self): + grouped = self.df.loc[:, ['C']].groupby(self.df['A']) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == 'A' + + def test_frame_describe_multikey(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in self.tsframe: + group = grouped[col].describe() + group_col = pd.MultiIndex([[col] * len(group.columns), + group.columns], + [[0] * len(group.columns), + range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + groupedT = self.tsframe.groupby({'A': 0, 'B': 0, + 'C': 1, 'D': 1}, axis=1) + result = groupedT.describe() + expected = self.tsframe.describe().T + expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], + [range(4), range(len(expected.index))]) + tm.assert_frame_equal(result, expected) + + def test_frame_describe_tupleindex(self): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, + 'y': [10, 20, 30, 40, 50] * 3, + 'z': [100, 200, 300, 400, 500] * 3}) + df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={'k': 'key'}) + pytest.raises(ValueError, lambda: df1.groupby('k').describe()) + pytest.raises(ValueError, lambda: df2.groupby('key').describe()) + + def test_frame_describe_unstacked_format(self): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + +# nunique +# -------------------------------- + +class TestNUnique(MixIn): + + def test_series_groupby_nunique(self): + + def check_nunique(df, keys, as_index=True): + for sort, dropna in cart_product((False, True), repeat=2): + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr['julie'].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + assert_series_equal(left, right, check_names=False) + + days = date_range('2015-08-23', periods=10) + + for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): + frame = DataFrame({ + 'jim': np.random.choice( + list(ascii_lowercase), n), + 'joe': np.random.choice(days, n), + 'julie': np.random.randint(0, m, n) + }) + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + + frame.loc[1::17, 'jim'] = None + frame.loc[3::37, 'joe'] = None + frame.loc[7::19, 'julie'] = None + frame.loc[8::19, 'julie'] = None + frame.loc[9::19, 'julie'] = None + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) + + def test_nunique(self): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + def test_nunique_with_object(self): + # GH 11077 + data = pd.DataFrame( + [[100, 1, 'Alice'], + [200, 2, 'Bob'], + [300, 3, 'Charlie'], + [-400, 4, 'Dan'], + [500, 5, 'Edith']], + columns=['amount', 'id', 'name'] + ) + + result = data.groupby(['id', 'amount'])['name'].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = pd.Series([1] * 5, name='name', index=index) + tm.assert_series_equal(result, expected) + + def test_nunique_with_empty_series(self): + # GH 12553 + data = pd.Series(name='name') + result = data.groupby(level=0).nunique() + expected = pd.Series(name='name', dtype='int64') + tm.assert_series_equal(result, expected) + + def test_nunique_with_timegrouper(self): + # GH 13453 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + Timestamp('2016-06-28 16:09:30'), + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}).set_index('time') + result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() + expected = test.groupby( + pd.Grouper(freq='h') + )['data'].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +# count +# -------------------------------- + +class TestCount(MixIn): + + def test_groupby_timedelta_cython_count(self): + df = DataFrame({'g': list('ab' * 2), + 'delt': np.arange(4).astype('timedelta64[ns]')}) + expected = Series([ + 2, 2 + ], index=pd.Index(['a', 'b'], name='g'), name='delt') + result = df.groupby('g').delt.count() + tm.assert_series_equal(expected, result) + + def test_count(self): + n = 1 << 15 + dr = date_range('2015-08-30', periods=n // 10, freq='T') + + df = DataFrame({ + '1st': np.random.choice( + list(ascii_lowercase), n), + '2nd': np.random.randint(0, 5, n), + '3rd': np.random.randn(n).round(3), + '4th': np.random.randint(-10, 10, n), + '5th': np.random.choice(dr, n), + '6th': np.random.randn(n).round(3), + '7th': np.random.randn(n).round(3), + '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), + '9th': np.random.choice( + list(ascii_lowercase), n) + }) + + for col in df.columns.drop(['1st', '2nd', '4th']): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df['9th'] = df['9th'].astype('category') + + for key in '1st', '2nd', ['1st', '2nd']: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + assert_frame_equal(left, right) + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, np.nan]], + columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], + index=[1, 3]) + expected.index.name = 'A' + assert_frame_equal(count_not_as, expected.reset_index()) + assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + assert_series_equal(count_B, expected['B']) + + def test_count_object(self): + df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 3, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 1, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + def test_count_cross_type(self): # GH8169 + vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( + 0, 2, (100, 2)))) + + df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df[df == 2] = np.nan + expected = df.groupby(['c', 'd']).count() + + for t in ['float32', 'object']: + df['a'] = df['a'].astype(t) + df['b'] = df['b'].astype(t) + result = df.groupby(['c', 'd']).count() + tm.assert_frame_equal(result, expected) + + def test_lower_int_prec_count(self): + df = DataFrame({'a': np.array( + [0, 1, 2, 100], np.int8), + 'b': np.array( + [1, 2, 3, 6], np.uint32), + 'c': np.array( + [4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + def test_count_uses_size_on_exception(self): + class RaisingObjectException(Exception): + pass + + class RaisingObject(object): + + def __init__(self, msg='I will raise inside Cython'): + super(RaisingObject, self).__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({'a': [RaisingObject() for _ in range(4)], + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2]}, index=pd.Index( + list('ab'), name='grp')) + tm.assert_frame_equal(result, expected) + + +# size +# -------------------------------- + +class TestSize(MixIn): + + def test_size(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = self.df.groupby('A') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = self.df.groupby('B') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) + for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + assert_series_equal(left, right, check_names=False) + + # GH11699 + df = DataFrame([], columns=['A', 'B']) + out = Series([], dtype='int64', index=Index([], name='A')) + assert_series_equal(df.groupby('A').size(), out) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d25117fbd954..6f022aeff577b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4,19 +4,16 @@ import pytest from warnings import catch_warnings -from string import ascii_lowercase from datetime import datetime -from numpy import nan from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex) + concat, Panel, DatetimeIndex, read_csv) from pandas.errors import UnsupportedFunctionCall, PerformanceWarning -from pandas.util.testing import (assert_panel_equal, assert_frame_equal, - assert_series_equal, assert_almost_equal, - assert_index_equal) +from pandas.util.testing import (assert_frame_equal, assert_index_equal, + assert_series_equal, assert_almost_equal) from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip, - builtins, OrderedDict, product as cart_product) + builtins, OrderedDict) from pandas import compat from collections import defaultdict import pandas.core.common as com @@ -76,261 +73,6 @@ def checkit(dtype): for dtype in ['int64', 'int32', 'float64', 'float32']: checkit(dtype) - def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) - g = df.groupby('A') - pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']] - - pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] - with tm.assert_raises_regex(KeyError, '^[^A]+$'): - # A should not be referenced as a bad column... - # will have to rethink regex if you change message! - g[['A', 'C']] - - def test_group_selection_cache(self): - # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') - - g = df.groupby('A') - result1 = g.head(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.tail(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.head(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.tail(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - def test_grouper_index_types(self): - # related GH5375 - # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) - for index in [tm.makeFloatIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, - tm.makePeriodIndex]: - - df.index = index(len(df)) - df.groupby(list('abcde')).apply(lambda x: x) - - df.index = list(reversed(df.index.tolist())) - df.groupby(list('abcde')).apply(lambda x: x) - - def test_grouper_multilevel_freq(self): - - # GH 7885 - # with level and freq specified in a pd.Grouper - from datetime import date, timedelta - d0 = date.today() - timedelta(days=14) - dates = date_range(d0, date.today()) - date_index = pd.MultiIndex.from_product( - [dates, dates], names=['foo', 'bar']) - df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) - - # Check string level - expected = df.reset_index().groupby([pd.Grouper( - key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() - # reset index changes columns dtype to object - expected.columns = pd.Index([0], dtype='int64') - - result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( - level='bar', freq='W')]).sum() - assert_frame_equal(result, expected) - - # Check integer level - result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( - level=1, freq='W')]).sum() - assert_frame_equal(result, expected) - - def test_grouper_creation_bug(self): - - # GH 8795 - df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) - g = df.groupby('A') - expected = g.sum() - - g = df.groupby(pd.Grouper(key='A')) - result = g.sum() - assert_frame_equal(result, expected) - - result = g.apply(lambda x: x.sum()) - assert_frame_equal(result, expected) - - g = df.groupby(pd.Grouper(key='A', axis=0)) - result = g.sum() - assert_frame_equal(result, expected) - - # GH14334 - # pd.Grouper(key=...) may be passed in a list - df = DataFrame({'A': [0, 0, 0, 1, 1, 1], - 'B': [1, 1, 2, 2, 3, 3], - 'C': [1, 2, 3, 4, 5, 6]}) - # Group by single column - expected = df.groupby('A').sum() - g = df.groupby([pd.Grouper(key='A')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group by two columns - # using a combination of strings and Grouper objects - expected = df.groupby(['A', 'B']).sum() - - # Group with two Grouper objects - g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group with a string and a Grouper object - g = df.groupby(['A', pd.Grouper(key='B')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group with a Grouper object and a string - g = df.groupby([pd.Grouper(key='A'), 'B']) - result = g.sum() - assert_frame_equal(result, expected) - - # GH8866 - s = Series(np.arange(8, dtype='int64'), - index=pd.MultiIndex.from_product( - [list('ab'), range(2), - date_range('20130101', periods=2)], - names=['one', 'two', 'three'])) - result = s.groupby(pd.Grouper(level='three', freq='M')).sum() - expected = Series([28], index=Index( - [Timestamp('2013-01-31')], freq='M', name='three')) - assert_series_equal(result, expected) - - # just specifying a level breaks - result = s.groupby(pd.Grouper(level='one')).sum() - expected = s.groupby(level='one').sum() - assert_series_equal(result, expected) - - def test_grouper_column_and_index(self): - # GH 14327 - - # Grouping a multi-index frame by a column and an index level should - # be equivalent to resetting the index and grouping by two columns - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_multi.reset_index().groupby(['B', 'inner']).mean() - assert_frame_equal(result, expected) - - # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_multi.reset_index().groupby(['inner', 'B']).mean() - assert_frame_equal(result, expected) - - # Grouping a single-index frame by a column and the index should - # be equivalent to resetting the index and grouping by two columns - df_single = df_multi.reset_index('outer') - result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_single.reset_index().groupby(['B', 'inner']).mean() - assert_frame_equal(result, expected) - - # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_single.reset_index().groupby(['inner', 'B']).mean() - assert_frame_equal(result, expected) - - def test_grouper_getting_correct_binner(self): - - # GH 10063 - # using a non-time-based grouper and a time-based grouper - # and specifying levels - df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( - [list('ab'), date_range('20130101', periods=80)], names=['one', - 'two'])) - result = df.groupby([pd.Grouper(level='one'), pd.Grouper( - level='two', freq='M')]).sum() - expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, - index=MultiIndex.from_product( - [list('ab'), - date_range('20130101', freq='M', periods=3)], - names=['one', 'two'])) - assert_frame_equal(result, expected) - - def test_grouper_iter(self): - assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] - - def test_empty_groups(self): - # see gh-1048 - pytest.raises(ValueError, self.df.groupby, []) - - def test_groupby_grouper(self): - grouped = self.df.groupby('A') - - result = self.df.groupby(grouped.grouper).mean() - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_groupby_duplicated_column_errormsg(self): - # GH7511 - df = DataFrame(columns=['A', 'B', 'A', 'C'], - data=[range(4), range(2, 6), range(0, 8, 2)]) - - pytest.raises(ValueError, df.groupby, 'A') - pytest.raises(ValueError, df.groupby, ['A', 'B']) - - grouped = df.groupby('B') - c = grouped.count() - assert c.columns.nlevels == 1 - assert c.columns.size == 3 - - def test_groupby_dict_mapping(self): - # GH #679 - from pandas import Series - s = Series({'T1': 5}) - result = s.groupby({'T1': 'T2'}).agg(sum) - expected = s.groupby(['T2']).agg(sum) - assert_series_equal(result, expected) - - s = Series([1., 2., 3., 4.], index=list('abcd')) - mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} - - result = s.groupby(mapping).mean() - result2 = s.groupby(mapping).agg(np.mean) - expected = s.groupby([0, 0, 1, 1]).mean() - expected2 = s.groupby([0, 0, 1, 1]).mean() - assert_series_equal(result, expected) - assert_series_equal(result, result2) - assert_series_equal(result, expected2) - - def test_groupby_grouper_f_sanity_checked(self): - dates = date_range('01-Jan-2013', periods=12, freq='MS') - ts = Series(np.random.randn(12), index=dates) - - # GH3035 - # index.map is used to apply grouper to the index - # if it fails on the elements, map tries it on the entire index as - # a sequence. That can yield invalid results that cause trouble - # down the line. - # the surprise comes from using key[0:6] rather then str(key)[0:6] - # when the elements are Timestamp. - # the result is Index[0:6], very confusing. - - pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) - def test_groupby_nonobject_dtype(self): key = self.mframe.index.labels[0] grouped = self.mframe.groupby(key) @@ -444,86 +186,6 @@ def f(grp): e.name = None assert_series_equal(result, e) - def test_get_group(self): - with catch_warnings(record=True): - wp = tm.makePanel() - grouped = wp.groupby(lambda x: x.month, axis='major') - - gp = grouped.get_group(1) - expected = wp.reindex( - major=[x for x in wp.major_axis if x.month == 1]) - assert_panel_equal(gp, expected) - - # GH 5267 - # be datelike friendly - df = DataFrame({'DATE': pd.to_datetime( - ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', - '11-Oct-2013', '11-Oct-2013']), - 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], - 'VAL': [1, 2, 3, 4, 5, 6]}) - - g = df.groupby('DATE') - key = list(g.groups)[0] - result1 = g.get_group(key) - result2 = g.get_group(Timestamp(key).to_pydatetime()) - result3 = g.get_group(str(Timestamp(key))) - assert_frame_equal(result1, result2) - assert_frame_equal(result1, result3) - - g = df.groupby(['DATE', 'label']) - - key = list(g.groups)[0] - result1 = g.get_group(key) - result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) - result3 = g.get_group((str(Timestamp(key[0])), key[1])) - assert_frame_equal(result1, result2) - assert_frame_equal(result1, result3) - - # must pass a same-length tuple with multiple keys - pytest.raises(ValueError, lambda: g.get_group('foo')) - pytest.raises(ValueError, lambda: g.get_group(('foo'))) - pytest.raises(ValueError, - lambda: g.get_group(('foo', 'bar', 'baz'))) - - def test_get_group_empty_bins(self): - - d = pd.DataFrame([3, 1, 7, 6]) - bins = [0, 5, 10, 15] - g = d.groupby(pd.cut(d[0], bins)) - - # TODO: should prob allow a str of Interval work as well - # IOW '(0, 5]' - result = g.get_group(pd.Interval(0, 5)) - expected = DataFrame([3, 1], index=[0, 1]) - assert_frame_equal(result, expected) - - pytest.raises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) - - def test_get_group_grouped_by_tuple(self): - # GH 8121 - df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T - gr = df.groupby('ids') - expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) - result = gr.get_group((1, )) - assert_frame_equal(result, expected) - - dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', - '2010-01-02']) - df = DataFrame({'ids': [(x, ) for x in dt]}) - gr = df.groupby('ids') - result = gr.get_group(('2010-01-01', )) - expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) - assert_frame_equal(result, expected) - - def test_grouping_error_on_multidim_input(self): - from pandas.core.groupby import Grouping - pytest.raises(ValueError, - Grouping, self.df.index, self.df[['A', 'A']]) - - def test_apply_describe_bug(self): - grouped = self.mframe.groupby(level='first') - grouped.describe() # it works! - def test_apply_issues(self): # GH 5788 @@ -604,22 +266,6 @@ def test_len(self): assert len(df.groupby(('b'))) == 3 assert len(df.groupby(('a', 'b'))) == 3 - def test_groups(self): - grouped = self.df.groupby(['A']) - groups = grouped.groups - assert groups is grouped.groups # caching works - - for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k).all() - - grouped = self.df.groupby(['A', 'B']) - groups = grouped.groups - assert groups is grouped.groups # caching works - - for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k[0]).all() - assert (self.df.loc[v]['B'] == k[1]).all() - def test_basic_regression(self): # regression T = [1.0 * x for x in lrange(1, 10) * 10][:1095] @@ -631,13 +277,13 @@ def test_basic_regression(self): grouped = result.groupby(groupings) grouped.mean() - def test_with_na(self): + def test_with_na_groups(self): index = Index(np.arange(10)) for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']: values = Series(np.ones(10), index, dtype=dtype) - labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar', - 'bar', nan, 'foo'], index=index) + labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, + 'bar', 'bar', np.nan, 'foo'], index=index) # this SHOULD be an int grouped = values.groupby(labels) @@ -730,81 +376,6 @@ def test_attr_wrapper(self): # make sure raises error pytest.raises(AttributeError, getattr, grouped, 'foo') - def test_series_describe_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - assert_series_equal(result['mean'], grouped.mean(), check_names=False) - assert_series_equal(result['std'], grouped.std(), check_names=False) - assert_series_equal(result['min'], grouped.min(), check_names=False) - - def test_series_describe_single(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() - assert_series_equal(result, expected) - - def test_series_index_name(self): - grouped = self.df.loc[:, ['C']].groupby(self.df['A']) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' - - def test_frame_describe_multikey(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in self.tsframe: - group = grouped[col].describe() - group_col = pd.MultiIndex([[col] * len(group.columns), - group.columns], - [[0] * len(group.columns), - range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - groupedT = self.tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) - result = groupedT.describe() - expected = self.tsframe.describe().T - expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], - [range(4), range(len(expected.index))]) - tm.assert_frame_equal(result, expected) - - def test_frame_describe_tupleindex(self): - - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) - pytest.raises(ValueError, lambda: df1.groupby('k').describe()) - pytest.raises(ValueError, lambda: df2.groupby('key').describe()) - - def test_frame_describe_unstacked_format(self): - # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df = pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) - tm.assert_frame_equal(result, expected) - def test_frame_groupby(self): grouped = self.tsframe.groupby(lambda x: x.weekday()) @@ -845,16 +416,6 @@ def test_frame_groupby(self): samething = self.tsframe.index.take(indices[k]) assert (samething == v).all() - def test_grouping_is_iterable(self): - # this code path isn't used anywhere else - # not sure it's useful - grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year - ]) - - # test it works - for g in grouped.grouper.groupings[0]: - pass - def test_frame_groupby_columns(self): mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} grouped = self.tsframe.groupby(mapping, axis=1) @@ -900,73 +461,6 @@ def test_frame_set_name_single(self): result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) assert result.index.name == 'A' - def test_multi_iter(self): - s = Series(np.arange(6)) - k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - - grouped = s.groupby([k1, k2]) - - iterated = list(grouped) - expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), - ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] - for i, ((one, two), three) in enumerate(iterated): - e1, e2, e3 = expected[i] - assert e1 == one - assert e2 == two - assert_series_equal(three, e3) - - def test_multi_iter_frame(self): - k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': k1, 'k2': k2}, - index=['one', 'two', 'three', 'four', 'five', 'six']) - - grouped = df.groupby(['k1', 'k2']) - - # things get sorted! - iterated = list(grouped) - idx = df.index - expected = [('a', '1', df.loc[idx[[4]]]), - ('a', '2', df.loc[idx[[3, 5]]]), - ('b', '1', df.loc[idx[[0, 2]]]), - ('b', '2', df.loc[idx[[1]]])] - for i, ((one, two), three) in enumerate(iterated): - e1, e2, e3 = expected[i] - assert e1 == one - assert e2 == two - assert_frame_equal(three, e3) - - # don't iterate through groups with no data - df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) - grouped = df.groupby(['k1', 'k2']) - groups = {} - for key, gp in grouped: - groups[key] = gp - assert len(groups) == 2 - - # axis = 1 - three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() - grouped = three_levels.T.groupby(axis=1, level=(1, 2)) - for key, group in grouped: - pass - - def test_multi_iter_panel(self): - with catch_warnings(record=True): - wp = tm.makePanel() - grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], - axis=1) - - for (month, wd), group in grouped: - exp_axis = [x - for x in wp.major_axis - if x.month == month and x.weekday() == wd] - expected = wp.reindex(major=exp_axis) - assert_panel_equal(group, expected) - def test_multi_func(self): col1 = self.df['A'] col2 = self.df['B'] @@ -1115,79 +609,6 @@ def test_groupby_as_index_agg(self): assert_frame_equal(left, right) - def test_series_groupby_nunique(self): - - def check_nunique(df, keys, as_index=True): - for sort, dropna in cart_product((False, True), repeat=2): - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - assert_series_equal(left, right, check_names=False) - - days = date_range('2015-08-23', periods=10) - - for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): - frame = DataFrame({ - 'jim': np.random.choice( - list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n) - }) - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) - - def test_multiindex_passthru(self): - - # GH 7997 - # regression from 0.14.1 - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) - - result = df.groupby(axis=1, level=[0, 1]).first() - assert_frame_equal(result, df) - - def test_multiindex_negative_level(self): - # GH 13901 - result = self.mframe.groupby(level=-1).sum() - expected = self.mframe.groupby(level='second').sum() - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=-2).sum() - expected = self.mframe.groupby(level='first').sum() - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=[-2, -1]).sum() - expected = self.mframe - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=[-1, 'first']).sum() - expected = self.mframe.groupby(level=['second', 'first']).sum() - assert_frame_equal(result, expected) - - def test_multifunc_select_col_integer_cols(self): - df = self.df - df.columns = np.arange(len(df.columns)) - - # it works! - df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) - def test_as_index_series_return_frame(self): grouped = self.df.groupby('A', as_index=False) grouped2 = self.df.groupby(['A', 'B'], as_index=False) @@ -1286,55 +707,6 @@ def test_groupby_as_index_apply(self): res = df.groupby(0, as_index=False).apply(lambda x: x).index assert_index_equal(res, ind) - def test_groupby_head_tail(self): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) - - # as_index= False, much easier - assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) - assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - - empty_not_as = DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_not_as, g_not_as.head(0)) - assert_frame_equal(empty_not_as, g_not_as.tail(0)) - assert_frame_equal(empty_not_as, g_not_as.head(-1)) - assert_frame_equal(empty_not_as, g_not_as.tail(-1)) - - assert_frame_equal(df, g_not_as.head(7)) # contains all - assert_frame_equal(df, g_not_as.tail(7)) - - # as_index=True, (used to be different) - df_as = df - - assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) - assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) - - empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_as, g_as.head(0)) - assert_frame_equal(empty_as, g_as.tail(0)) - assert_frame_equal(empty_as, g_as.head(-1)) - assert_frame_equal(empty_as, g_as.tail(-1)) - - assert_frame_equal(df_as, g_as.head(7)) # contains all - assert_frame_equal(df_as, g_as.tail(7)) - - # test with selection - assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, lambda x: x.month, @@ -1613,15 +985,6 @@ def test_arg_passthru(self): result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - def test_groupby_timedelta_cython_count(self): - df = DataFrame({'g': list('ab' * 2), - 'delt': np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() - tm.assert_series_equal(expected, result) - def test_wrap_aggregated_output_multindex(self): df = self.mframe.T df['baz', 'two'] = 'peekaboo' @@ -1639,75 +1002,6 @@ def aggfun(ser): agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) - @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level(self, sort): - # GH 17537 - frame = self.mframe - deleveled = frame.reset_index() - - result0 = frame.groupby(level=0, sort=sort).sum() - result1 = frame.groupby(level=1, sort=sort).sum() - - expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() - expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() - - expected0.index.name = 'first' - expected1.index.name = 'second' - - assert result0.index.name == 'first' - assert result1.index.name == 'second' - - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - assert result0.index.name == frame.index.names[0] - assert result1.index.name == frame.index.names[1] - - # groupby level name - result0 = frame.groupby(level='first', sort=sort).sum() - result1 = frame.groupby(level='second', sort=sort).sum() - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - - # axis=1 - - result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() - result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() - assert_frame_equal(result0, expected0.T) - assert_frame_equal(result1, expected1.T) - - # raise exception for non-MultiIndex - pytest.raises(ValueError, self.df.groupby, level=1) - - def test_groupby_level_index_names(self): - # GH4014 this used to raise ValueError since 'exp'>1 (in py2) - df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, - 'var1': lrange(6), }).set_index('exp') - df.groupby(level='exp') - pytest.raises(ValueError, df.groupby, level='foo') - - @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level_with_nas(self, sort): - # GH 17537 - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, - 2, 3]]) - - # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) - result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 22.], index=[0, 1]) - assert_series_equal(result, expected) - - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, - 1, 2, 3]]) - - # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) - result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 18.], index=[0.0, 1.0]) - assert_series_equal(result, expected) - def test_groupby_level_apply(self): frame = self.mframe @@ -1719,22 +1013,6 @@ def test_groupby_level_apply(self): result = frame['A'].groupby(level=0).count() assert result.index.name == 'first' - def test_groupby_args(self): - # PR8618 and issue 8015 - frame = self.mframe - - def j(): - frame.groupby() - - tm.assert_raises_regex(TypeError, "You have to supply one of " - "'by' and 'level'", j) - - def k(): - frame.groupby(by=None, level=None) - - tm.assert_raises_regex(TypeError, "You have to supply one of " - "'by' and 'level'", k) - def test_groupby_level_mapper(self): frame = self.mframe deleveled = frame.reset_index() @@ -1788,21 +1066,6 @@ def test_groupby_complex(self): result = a.sum(level=0) assert_series_equal(result, expected) - @pytest.mark.parametrize('sort,labels', [ - [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], - [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] - ]) - def test_level_preserve_order(self, sort, labels): - # GH 17537 - grouped = self.mframe.groupby(level=0, sort=sort) - exp_labels = np.array(labels, np.intp) - assert_almost_equal(grouped.grouper.labels[0], exp_labels) - - def test_grouping_labels(self): - grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) - exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - assert_almost_equal(grouped.grouper.labels[0], exp_labels) - def test_apply_series_to_frame(self): def f(piece): with np.errstate(invalid='ignore'): @@ -2014,157 +1277,26 @@ def f(x, q=None, axis=0): assert_frame_equal(agg_result, expected, check_names=False) assert_frame_equal(apply_result, expected) - def test_size(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('A') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('B') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) - assert_series_equal(left, right, check_names=False) - - # GH11699 - df = DataFrame([], columns=['A', 'B']) - out = Series([], dtype='int64', index=Index([], name='A')) - assert_series_equal(df.groupby('A').size(), out) - - def test_count(self): - from string import ascii_lowercase - n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', '4th']): - df.loc[np.random.choice(n, n // 10), col] = np.nan - - df['9th'] = df['9th'].astype('category') - - for key in '1st', '2nd', ['1st', '2nd']: - left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) - assert_frame_equal(left, right) - - # GH5610 - # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], - columns=['A', 'B', 'C']) - - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() - - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - assert_frame_equal(count_not_as, expected.reset_index()) - assert_frame_equal(count_as, expected) - - count_B = df.groupby('A')['B'].count() - assert_series_equal(count_B, expected['B']) - - def test_count_object(self): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - def test_count_cross_type(self): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) - - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) - df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() - - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() - tm.assert_frame_equal(result, expected) - - def test_nunique(self): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() - tm.assert_frame_equal(result, expected) - def test_non_cython_api(self): # GH5610 # non-cython calls should not include the grouper df = DataFrame( - [[1, 2, 'foo'], [1, - nan, - 'bar', ], [3, nan, 'baz'] - ], columns=['A', 'B', 'C']) + [[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, 'baz']], + columns=['A', 'B', 'C']) g = df.groupby('A') gni = df.groupby('A', as_index=False) # mad - expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3]) + expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.mad() assert_frame_equal(result, expected) - expected = DataFrame([[0., 0.], [0, nan]], columns=['A', 'B'], + expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], index=[0, 1]) result = gni.mad() assert_frame_equal(result, expected) @@ -2175,8 +1307,9 @@ def test_non_cython_api(self): ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']], labels=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, nan, nan, nan, nan, nan, nan, nan]], + expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan]], index=expected_index, columns=expected_col) result = g.describe() @@ -2196,7 +1329,7 @@ def test_non_cython_api(self): assert_frame_equal(result, expected) # idxmax - expected = DataFrame([[0.0], [nan]], columns=['B'], index=[1, 3]) + expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.idxmax() assert_frame_equal(result, expected) @@ -2319,7 +1452,6 @@ def f(g): assert 'value3' in result def test_groupby_wrong_multi_labels(self): - from pandas import read_csv data = """index,foo,bar,baz,spam,data 0,foo1,bar1,baz1,spam2,20 1,foo1,bar2,baz1,spam3,30 @@ -2620,14 +1752,6 @@ def test_groupby_nat_exclude(self): pytest.raises(KeyError, grouped.get_group, np.nan) pytest.raises(KeyError, grouped.get_group, pd.NaT) - def test_dictify(self): - dict(iter(self.df.groupby('A'))) - dict(iter(self.df.groupby(['A', 'B']))) - dict(iter(self.df['C'].groupby(self.df['A']))) - dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) - dict(iter(self.df.groupby('A')['C'])) - dict(iter(self.df.groupby(['A', 'B'])['C'])) - def test_sparse_friendly(self): sdf = self.df[['C', 'D']].to_sparse() with catch_warnings(record=True): @@ -2734,16 +1858,6 @@ def test_intercept_builtin_sum(self): assert_series_equal(result, expected) assert_series_equal(result2, expected) - def test_column_select_via_attr(self): - result = self.df.groupby('A').C.sum() - expected = self.df.groupby('A')['C'].sum() - assert_series_equal(result, expected) - - self.df['mean'] = 1.5 - result = self.df.groupby('A').mean() - expected = self.df.groupby('A').agg(np.mean) - assert_frame_equal(result, expected) - def test_rank_apply(self): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130) @@ -2835,40 +1949,6 @@ def g(group): assert isinstance(result, Series) assert_series_equal(result, expected) - def test_getitem_list_of_columns(self): - df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': np.random.randn(8)}) - - result = df.groupby('A')[['C', 'D']].mean() - result2 = df.groupby('A')['C', 'D'].mean() - result3 = df.groupby('A')[df.columns[2:4]].mean() - - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - - def test_getitem_numeric_column_names(self): - # GH #13731 - df = DataFrame({0: list('abcd') * 2, - 2: np.random.randn(8), - 4: np.random.randn(8), - 6: np.random.randn(8)}) - result = df.groupby(0)[df.columns[1:3]].mean() - result2 = df.groupby(0)[2, 4].mean() - result3 = df.groupby(0)[[2, 4]].mean() - - expected = df.loc[:, [0, 2, 4]].groupby(0).mean() - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - def test_set_group_name(self): def f(group): assert group.name is not None @@ -3149,22 +2229,6 @@ def test_groupby_multiindex_not_lexsorted(self): expected = df.sort_index() tm.assert_frame_equal(expected, result) - def test_groupby_levels_and_columns(self): - # GH9344, GH9049 - idx_names = ['x', 'y'] - idx = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) - df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) - - by_levels = df.groupby(level=idx_names).mean() - # reset_index changes columns dtype to object - by_columns = df.reset_index().groupby(idx_names).mean() - - tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) - - by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) - tm.assert_frame_equal(by_levels, by_columns) - def test_gb_apply_list_of_unequal_len_arrays(self): # GH1738 @@ -3189,74 +2253,6 @@ def noddy(value, weight): # don't die df_grouped.apply(lambda x: noddy(x.value, x.weight)) - def test_groupby_with_empty(self): - index = pd.DatetimeIndex(()) - data = () - series = pd.Series(data, index) - grouper = pd.Grouper(freq='D') - grouped = series.groupby(grouper) - assert next(iter(grouped), None) is None - - def test_groupby_with_single_column(self): - df = pd.DataFrame({'a': list('abssbab')}) - tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) - # GH 13530 - exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) - tm.assert_frame_equal(df.groupby('a').count(), exp) - tm.assert_frame_equal(df.groupby('a').sum(), exp) - tm.assert_frame_equal(df.groupby('a').nth(1), exp) - - def test_groupby_with_small_elem(self): - # GH 8542 - # length=2 - df = pd.DataFrame({'event': ['start', 'start'], - 'change': [1234, 5678]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 2 - assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-09-15'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 2 - assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0, 2], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - - # length=3 - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-08-05'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 3 - assert grouped.ngroups == 3 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_fill_constistency(self): # GH9221 @@ -3303,42 +2299,6 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) - def test_lower_int_prec_count(self): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) - tm.assert_frame_equal(result, expected) - - def test_count_uses_size_on_exception(self): - class RaisingObjectException(Exception): - pass - - class RaisingObject(object): - - def __init__(self, msg='I will raise inside Cython'): - super(RaisingObject, self).__init__() - self.msg = msg - - def __eq__(self, other): - # gets called in Cython to check that raising calls the method - raise RaisingObjectException(self.msg) - - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) - tm.assert_frame_equal(result, expected) - def test_groupby_cumprod(self): # GH 4095 df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) @@ -3510,42 +2470,6 @@ def test_sort(x): g.apply(test_sort) - def test_nunique_with_object(self): - # GH 11077 - data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] - ) - - result = data.groupby(['id', 'amount'])['name'].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) - tm.assert_series_equal(result, expected) - - def test_nunique_with_empty_series(self): - # GH 12553 - data = pd.Series(name='name') - result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') - tm.assert_series_equal(result, expected) - - def test_nunique_with_timegrouper(self): - # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) - tm.assert_series_equal(result, expected) - def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) @@ -3559,16 +2483,6 @@ def test_numpy_compat(self): tm.assert_raises_regex(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) - def test_grouping_string_repr(self): - # GH 13394 - mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) - df = DataFrame([[1, 2, 3]], columns=mi) - gr = df.groupby(df[('A', 'a')]) - - result = gr.grouper.groupings[0].__repr__() - expected = "Grouping(('A', 'a'))" - assert result == expected - def test_group_shift_with_null_key(self): # This test is designed to replicate the segfault in issue #13813. n_rows = 1200 @@ -3749,19 +2663,6 @@ def predictions(tool): result = df2.groupby('Key').apply(predictions).p1 tm.assert_series_equal(expected, result) - def test_gb_key_len_equal_axis_len(self): - # GH16843 - # test ensures that index and column keys are recognized correctly - # when number of keys equals axis length of groupby - df = pd.DataFrame([['foo', 'bar', 'B', 1], - ['foo', 'bar', 'B', 2], - ['foo', 'baz', 'C', 3]], - columns=['first', 'second', 'third', 'one']) - df = df.set_index(['first', 'second']) - df = df.groupby(['first', 'second', 'third']).size() - assert df.loc[('foo', 'bar', 'B')] == 2 - assert df.loc[('foo', 'baz', 'C')] == 1 - def test_pipe(self): # Test the pipe method of DataFrameGroupBy. # Issue #17871 diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py new file mode 100644 index 0000000000000..824c754a5d753 --- /dev/null +++ b/pandas/tests/groupby/test_grouping.py @@ -0,0 +1,732 @@ +# -*- coding: utf-8 -*- + +""" test where we are determining what we are grouping, or getting groups """ + +import pytest + +from warnings import catch_warnings +from pandas import (date_range, Timestamp, + Index, MultiIndex, DataFrame, Series) +from pandas.util.testing import (assert_panel_equal, assert_frame_equal, + assert_series_equal, assert_almost_equal) +from pandas.compat import lrange + +from pandas import compat +import numpy as np + +import pandas.util.testing as tm +import pandas as pd +from .common import MixIn + + +# selection +# -------------------------------- + +class TestSelection(MixIn): + + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=['A', 'B']) + g = df.groupby('A') + pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']] + + pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] + with tm.assert_raises_regex(KeyError, '^[^A]+$'): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! + g[['A', 'C']] + + def test_groupby_duplicated_column_errormsg(self): + # GH7511 + df = DataFrame(columns=['A', 'B', 'A', 'C'], + data=[range(4), range(2, 6), range(0, 8, 2)]) + + pytest.raises(ValueError, df.groupby, 'A') + pytest.raises(ValueError, df.groupby, ['A', 'B']) + + grouped = df.groupby('B') + c = grouped.count() + assert c.columns.nlevels == 1 + assert c.columns.size == 3 + + def test_column_select_via_attr(self): + result = self.df.groupby('A').C.sum() + expected = self.df.groupby('A')['C'].sum() + assert_series_equal(result, expected) + + self.df['mean'] = 1.5 + result = self.df.groupby('A').mean() + expected = self.df.groupby('A').agg(np.mean) + assert_frame_equal(result, expected) + + def test_getitem_list_of_columns(self): + df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': np.random.randn(8)}) + + result = df.groupby('A')[['C', 'D']].mean() + result2 = df.groupby('A')['C', 'D'].mean() + result3 = df.groupby('A')[df.columns[2:4]].mean() + + expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + def test_getitem_numeric_column_names(self): + # GH #13731 + df = DataFrame({0: list('abcd') * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8)}) + result = df.groupby(0)[df.columns[1:3]].mean() + result2 = df.groupby(0)[2, 4].mean() + result3 = df.groupby(0)[[2, 4]].mean() + + expected = df.loc[:, [0, 2, 4]].groupby(0).mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + +# grouping +# -------------------------------- + +class TestGrouping(MixIn): + + def test_grouper_index_types(self): + # related GH5375 + # groupby misbehaving when using a Floatlike index + df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) + for index in [tm.makeFloatIndex, tm.makeStringIndex, + tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, + tm.makePeriodIndex]: + + df.index = index(len(df)) + df.groupby(list('abcde')).apply(lambda x: x) + + df.index = list(reversed(df.index.tolist())) + df.groupby(list('abcde')).apply(lambda x: x) + + def test_grouper_multilevel_freq(self): + + # GH 7885 + # with level and freq specified in a pd.Grouper + from datetime import date, timedelta + d0 = date.today() - timedelta(days=14) + dates = date_range(d0, date.today()) + date_index = pd.MultiIndex.from_product( + [dates, dates], names=['foo', 'bar']) + df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) + + # Check string level + expected = df.reset_index().groupby([pd.Grouper( + key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() + # reset index changes columns dtype to object + expected.columns = pd.Index([0], dtype='int64') + + result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( + level='bar', freq='W')]).sum() + assert_frame_equal(result, expected) + + # Check integer level + result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( + level=1, freq='W')]).sum() + assert_frame_equal(result, expected) + + def test_grouper_creation_bug(self): + + # GH 8795 + df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) + g = df.groupby('A') + expected = g.sum() + + g = df.groupby(pd.Grouper(key='A')) + result = g.sum() + assert_frame_equal(result, expected) + + result = g.apply(lambda x: x.sum()) + assert_frame_equal(result, expected) + + g = df.groupby(pd.Grouper(key='A', axis=0)) + result = g.sum() + assert_frame_equal(result, expected) + + # GH14334 + # pd.Grouper(key=...) may be passed in a list + df = DataFrame({'A': [0, 0, 0, 1, 1, 1], + 'B': [1, 1, 2, 2, 3, 3], + 'C': [1, 2, 3, 4, 5, 6]}) + # Group by single column + expected = df.groupby('A').sum() + g = df.groupby([pd.Grouper(key='A')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group by two columns + # using a combination of strings and Grouper objects + expected = df.groupby(['A', 'B']).sum() + + # Group with two Grouper objects + g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a string and a Grouper object + g = df.groupby(['A', pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a Grouper object and a string + g = df.groupby([pd.Grouper(key='A'), 'B']) + result = g.sum() + assert_frame_equal(result, expected) + + # GH8866 + s = Series(np.arange(8, dtype='int64'), + index=pd.MultiIndex.from_product( + [list('ab'), range(2), + date_range('20130101', periods=2)], + names=['one', 'two', 'three'])) + result = s.groupby(pd.Grouper(level='three', freq='M')).sum() + expected = Series([28], index=Index( + [Timestamp('2013-01-31')], freq='M', name='three')) + assert_series_equal(result, expected) + + # just specifying a level breaks + result = s.groupby(pd.Grouper(level='one')).sum() + expected = s.groupby(level='one').sum() + assert_series_equal(result, expected) + + def test_grouper_column_and_index(self): + # GH 14327 + + # Grouping a multi-index frame by a column and an index level should + # be equivalent to resetting the index and grouping by two columns + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + + # Grouping a single-index frame by a column and the index should + # be equivalent to resetting the index and grouping by two columns + df_single = df_multi.reset_index('outer') + result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_single.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_single.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + + def test_groupby_levels_and_columns(self): + # GH9344, GH9049 + idx_names = ['x', 'y'] + idx = pd.MultiIndex.from_tuples( + [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) + df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) + + by_levels = df.groupby(level=idx_names).mean() + # reset_index changes columns dtype to object + by_columns = df.reset_index().groupby(idx_names).mean() + + tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) + + by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) + tm.assert_frame_equal(by_levels, by_columns) + + def test_grouper_getting_correct_binner(self): + + # GH 10063 + # using a non-time-based grouper and a time-based grouper + # and specifying levels + df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( + [list('ab'), date_range('20130101', periods=80)], names=['one', + 'two'])) + result = df.groupby([pd.Grouper(level='one'), pd.Grouper( + level='two', freq='M')]).sum() + expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, + index=MultiIndex.from_product( + [list('ab'), + date_range('20130101', freq='M', periods=3)], + names=['one', 'two'])) + assert_frame_equal(result, expected) + + def test_grouper_iter(self): + assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] + + def test_empty_groups(self): + # see gh-1048 + pytest.raises(ValueError, self.df.groupby, []) + + def test_groupby_grouper(self): + grouped = self.df.groupby('A') + + result = self.df.groupby(grouped.grouper).mean() + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_groupby_dict_mapping(self): + # GH #679 + from pandas import Series + s = Series({'T1': 5}) + result = s.groupby({'T1': 'T2'}).agg(sum) + expected = s.groupby(['T2']).agg(sum) + assert_series_equal(result, expected) + + s = Series([1., 2., 3., 4.], index=list('abcd')) + mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} + + result = s.groupby(mapping).mean() + result2 = s.groupby(mapping).agg(np.mean) + expected = s.groupby([0, 0, 1, 1]).mean() + expected2 = s.groupby([0, 0, 1, 1]).mean() + assert_series_equal(result, expected) + assert_series_equal(result, result2) + assert_series_equal(result, expected2) + + def test_groupby_grouper_f_sanity_checked(self): + dates = date_range('01-Jan-2013', periods=12, freq='MS') + ts = Series(np.random.randn(12), index=dates) + + # GH3035 + # index.map is used to apply grouper to the index + # if it fails on the elements, map tries it on the entire index as + # a sequence. That can yield invalid results that cause trouble + # down the line. + # the surprise comes from using key[0:6] rather then str(key)[0:6] + # when the elements are Timestamp. + # the result is Index[0:6], very confusing. + + pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) + + def test_grouping_error_on_multidim_input(self): + from pandas.core.groupby import Grouping + pytest.raises(ValueError, + Grouping, self.df.index, self.df[['A', 'A']]) + + def test_multiindex_passthru(self): + + # GH 7997 + # regression from 0.14.1 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) + + result = df.groupby(axis=1, level=[0, 1]).first() + assert_frame_equal(result, df) + + def test_multiindex_negative_level(self): + # GH 13901 + result = self.mframe.groupby(level=-1).sum() + expected = self.mframe.groupby(level='second').sum() + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=-2).sum() + expected = self.mframe.groupby(level='first').sum() + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=[-2, -1]).sum() + expected = self.mframe + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=[-1, 'first']).sum() + expected = self.mframe.groupby(level=['second', 'first']).sum() + assert_frame_equal(result, expected) + + def test_multifunc_select_col_integer_cols(self): + df = self.df + df.columns = np.arange(len(df.columns)) + + # it works! + df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level(self, sort): + # GH 17537 + frame = self.mframe + deleveled = frame.reset_index() + + result0 = frame.groupby(level=0, sort=sort).sum() + result1 = frame.groupby(level=1, sort=sort).sum() + + expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() + expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() + + expected0.index.name = 'first' + expected1.index.name = 'second' + + assert result0.index.name == 'first' + assert result1.index.name == 'second' + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + assert result0.index.name == frame.index.names[0] + assert result1.index.name == frame.index.names[1] + + # groupby level name + result0 = frame.groupby(level='first', sort=sort).sum() + result1 = frame.groupby(level='second', sort=sort).sum() + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + # axis=1 + + result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() + result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() + assert_frame_equal(result0, expected0.T) + assert_frame_equal(result1, expected1.T) + + # raise exception for non-MultiIndex + pytest.raises(ValueError, self.df.groupby, level=1) + + def test_groupby_level_index_names(self): + # GH4014 this used to raise ValueError since 'exp'>1 (in py2) + df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, + 'var1': lrange(6), }).set_index('exp') + df.groupby(level='exp') + pytest.raises(ValueError, df.groupby, level='foo') + + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level_with_nas(self, sort): + # GH 17537 + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, + 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 22.], index=[0, 1]) + assert_series_equal(result, expected) + + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, + 1, 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 18.], index=[0.0, 1.0]) + assert_series_equal(result, expected) + + def test_groupby_args(self): + # PR8618 and issue 8015 + frame = self.mframe + + def j(): + frame.groupby() + + tm.assert_raises_regex(TypeError, "You have to supply one of " + "'by' and 'level'", j) + + def k(): + frame.groupby(by=None, level=None) + + tm.assert_raises_regex(TypeError, "You have to supply one of " + "'by' and 'level'", k) + + @pytest.mark.parametrize('sort,labels', [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] + ]) + def test_level_preserve_order(self, sort, labels): + # GH 17537 + grouped = self.mframe.groupby(level=0, sort=sort) + exp_labels = np.array(labels, np.intp) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_grouping_labels(self): + grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + +# get_group +# -------------------------------- + +class TestGetGroup(MixIn): + + def test_get_group(self): + with catch_warnings(record=True): + wp = tm.makePanel() + grouped = wp.groupby(lambda x: x.month, axis='major') + + gp = grouped.get_group(1) + expected = wp.reindex( + major=[x for x in wp.major_axis if x.month == 1]) + assert_panel_equal(gp, expected) + + # GH 5267 + # be datelike friendly + df = DataFrame({'DATE': pd.to_datetime( + ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', + '11-Oct-2013', '11-Oct-2013']), + 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], + 'VAL': [1, 2, 3, 4, 5, 6]}) + + g = df.groupby('DATE') + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group(Timestamp(key).to_pydatetime()) + result3 = g.get_group(str(Timestamp(key))) + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + + g = df.groupby(['DATE', 'label']) + + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) + result3 = g.get_group((str(Timestamp(key[0])), key[1])) + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + + # must pass a same-length tuple with multiple keys + pytest.raises(ValueError, lambda: g.get_group('foo')) + pytest.raises(ValueError, lambda: g.get_group(('foo'))) + pytest.raises(ValueError, + lambda: g.get_group(('foo', 'bar', 'baz'))) + + def test_get_group_empty_bins(self): + + d = pd.DataFrame([3, 1, 7, 6]) + bins = [0, 5, 10, 15] + g = d.groupby(pd.cut(d[0], bins)) + + # TODO: should prob allow a str of Interval work as well + # IOW '(0, 5]' + result = g.get_group(pd.Interval(0, 5)) + expected = DataFrame([3, 1], index=[0, 1]) + assert_frame_equal(result, expected) + + pytest.raises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) + + def test_get_group_grouped_by_tuple(self): + # GH 8121 + df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T + gr = df.groupby('ids') + expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) + result = gr.get_group((1, )) + assert_frame_equal(result, expected) + + dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', + '2010-01-02']) + df = DataFrame({'ids': [(x, ) for x in dt]}) + gr = df.groupby('ids') + result = gr.get_group(('2010-01-01', )) + expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) + assert_frame_equal(result, expected) + + def test_groupby_with_empty(self): + index = pd.DatetimeIndex(()) + data = () + series = pd.Series(data, index) + grouper = pd.Grouper(freq='D') + grouped = series.groupby(grouper) + assert next(iter(grouped), None) is None + + def test_groupby_with_single_column(self): + df = pd.DataFrame({'a': list('abssbab')}) + tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + # GH 13530 + exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) + tm.assert_frame_equal(df.groupby('a').count(), exp) + tm.assert_frame_equal(df.groupby('a').sum(), exp) + tm.assert_frame_equal(df.groupby('a').nth(1), exp) + + def test_gb_key_len_equal_axis_len(self): + # GH16843 + # test ensures that index and column keys are recognized correctly + # when number of keys equals axis length of groupby + df = pd.DataFrame([['foo', 'bar', 'B', 1], + ['foo', 'bar', 'B', 2], + ['foo', 'baz', 'C', 3]], + columns=['first', 'second', 'third', 'one']) + df = df.set_index(['first', 'second']) + df = df.groupby(['first', 'second', 'third']).size() + assert df.loc[('foo', 'bar', 'B')] == 2 + assert df.loc[('foo', 'baz', 'C')] == 1 + + +# groups & iteration +# -------------------------------- + +class TestIteration(MixIn): + + def test_groups(self): + grouped = self.df.groupby(['A']) + groups = grouped.groups + assert groups is grouped.groups # caching works + + for k, v in compat.iteritems(grouped.groups): + assert (self.df.loc[v]['A'] == k).all() + + grouped = self.df.groupby(['A', 'B']) + groups = grouped.groups + assert groups is grouped.groups # caching works + + for k, v in compat.iteritems(grouped.groups): + assert (self.df.loc[v]['A'] == k[0]).all() + assert (self.df.loc[v]['B'] == k[1]).all() + + def test_grouping_is_iterable(self): + # this code path isn't used anywhere else + # not sure it's useful + grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year + ]) + + # test it works + for g in grouped.grouper.groupings[0]: + pass + + def test_multi_iter(self): + s = Series(np.arange(6)) + k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + + grouped = s.groupby([k1, k2]) + + iterated = list(grouped) + expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), + ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + assert_series_equal(three, e3) + + def test_multi_iter_frame(self): + k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': k1, 'k2': k2}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + + grouped = df.groupby(['k1', 'k2']) + + # things get sorted! + iterated = list(grouped) + idx = df.index + expected = [('a', '1', df.loc[idx[[4]]]), + ('a', '2', df.loc[idx[[3, 5]]]), + ('b', '1', df.loc[idx[[0, 2]]]), + ('b', '2', df.loc[idx[[1]]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + assert_frame_equal(three, e3) + + # don't iterate through groups with no data + df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) + grouped = df.groupby(['k1', 'k2']) + groups = {} + for key, gp in grouped: + groups[key] = gp + assert len(groups) == 2 + + # axis = 1 + three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + grouped = three_levels.T.groupby(axis=1, level=(1, 2)) + for key, group in grouped: + pass + + def test_multi_iter_panel(self): + with catch_warnings(record=True): + wp = tm.makePanel() + grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], + axis=1) + + for (month, wd), group in grouped: + exp_axis = [x + for x in wp.major_axis + if x.month == month and x.weekday() == wd] + expected = wp.reindex(major=exp_axis) + assert_panel_equal(group, expected) + + def test_dictify(self): + dict(iter(self.df.groupby('A'))) + dict(iter(self.df.groupby(['A', 'B']))) + dict(iter(self.df['C'].groupby(self.df['A']))) + dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) + dict(iter(self.df.groupby('A')['C'])) + dict(iter(self.df.groupby(['A', 'B'])['C'])) + + def test_groupby_with_small_elem(self): + # GH 8542 + # length=2 + df = pd.DataFrame({'event': ['start', 'start'], + 'change': [1234, 5678]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + df = pd.DataFrame({'event': ['start', 'start', 'start'], + 'change': [1234, 5678, 9123]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', + '2014-09-15'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0, 2], :]) + res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + # length=3 + df = pd.DataFrame({'event': ['start', 'start', 'start'], + 'change': [1234, 5678, 9123]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', + '2014-08-05'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 3 + assert grouped.ngroups == 3 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[2], :]) + + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[('A', 'a')]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + assert result == expected diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index ffbede0eb208f..501fe63137cf4 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -238,6 +238,84 @@ def test_nth_multi_index_as_expected(self): names=['A', 'B'])) assert_frame_equal(result, expected) + def test_groupby_head_tail(self): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index= False, much easier + assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) + + empty_not_as = DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) + empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + assert_frame_equal(empty_as, g_as.tail(-1)) + + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + def test_group_selection_cache(self): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + def test_nth_empty(): # GH 16064 diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 4b821dade6eae..c0ea968ab0819 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -474,32 +474,41 @@ def test_cython_group_transform_algos(self): np.timedelta64(5, 'ns')]) tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) - def test_cython_transform(self): + @pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) + def test_cython_transform_series(self, op, args, targop): # GH 4095 - ops = [(('cumprod', - ()), lambda x: x.cumprod()), (('cumsum', ()), - lambda x: x.cumsum()), - (('shift', (-1, )), - lambda x: x.shift(-1)), (('shift', - (1, )), lambda x: x.shift())] - s = Series(np.random.randn(1000)) s_missing = s.copy() s_missing.iloc[2:10] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) # series - for (op, args), targop in ops: - for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) - - tm.assert_series_equal(expected, - data.groupby(labels).transform(op, - *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) - + for data in [s, s_missing]: + # print(data.head()) + expected = data.groupby(labels).transform(targop) + + tm.assert_series_equal( + expected, + data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr( + data.groupby(labels), op)(*args)) + + @pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) + def test_cython_transform_frame(self, op, args, targop): + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, size=1000).astype(float) strings = list('qwertyuiopasdfghjklz') strings_missing = strings[:] strings_missing[5] = np.nan @@ -530,34 +539,33 @@ def test_cython_transform(self): if op == 'shift': gb._set_group_selection() - for (op, args), targop in ops: - if op != 'shift' and 'int' not in gb_target: - # numeric apply fastpath promotes dtype so have - # to apply separately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) - expected = pd.concat([f, i], axis=1) + if op != 'shift' and 'int' not in gb_target: + # numeric apply fastpath promotes dtype so have + # to apply separately and concat + i = gb[['int']].apply(targop) + f = gb[['float', 'float_missing']].apply(targop) + expected = pd.concat([f, i], axis=1) + else: + expected = gb.apply(targop) + + expected = expected.sort_index(axis=1) + tm.assert_frame_equal(expected, + gb.transform(op, *args).sort_index( + axis=1)) + tm.assert_frame_equal(expected, getattr(gb, op)(*args)) + # individual columns + for c in df: + if c not in ['float', 'int', 'float_missing' + ] and op != 'shift': + pytest.raises(DataError, gb[c].transform, op) + pytest.raises(DataError, getattr(gb[c], op)) else: - expected = gb.apply(targop) - - expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal(expected, getattr(gb, op)(*args)) - # individual columns - for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': - pytest.raises(DataError, gb[c].transform, op) - pytest.raises(DataError, getattr(gb[c], op)) - else: - expected = gb[c].apply(targop) - expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - getattr(gb[c], op)(*args)) + expected = gb[c].apply(targop) + expected.name = c + tm.assert_series_equal(expected, + gb[c].transform(op, *args)) + tm.assert_series_equal(expected, + getattr(gb[c], op)(*args)) def test_transform_with_non_scalar_group(self): # GH 10165 diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index b70a03ec3a1d3..3d7977c63eeb6 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -1,3 +1,9 @@ +""" +these are systematically testing all of the args to value_counts +with different size combinations. This is to ensure stability of the sorting +and proper parameter handling +""" + import pytest from itertools import product @@ -7,55 +13,64 @@ from pandas import MultiIndex, DataFrame, Series, date_range -@pytest.mark.slow -@pytest.mark.parametrize("n,m", product((100, 1000), (5, 20))) -def test_series_groupby_value_counts(n, m): +# our starting frame +def seed_df(seed_nans, n, m): np.random.seed(1234) + days = date_range('2015-08-24', periods=10) - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - def check_value_counts(df, keys, bins): - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): + frame = DataFrame({ + '1st': np.random.choice( + list('abcd'), n), + '2nd': np.random.choice(days, n), + '3rd': np.random.randint(1, m + 1, n) + }) - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) + if seed_nans: + frame.loc[1::11, '1st'] = np.nan + frame.loc[3::17, '2nd'] = np.nan + frame.loc[7::19, '3rd'] = np.nan + frame.loc[8::19, '3rd'] = np.nan + frame.loc[9::19, '3rd'] = np.nan - gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) + return frame - gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - tm.assert_series_equal(left.sort_index(), right.sort_index()) +# create input df, keys, and the bins +binned = [] +ids = [] +for seed_nans in [True, False]: + for n, m in product((100, 1000), (5, 20)): - def loop(df): + df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) keys = '1st', '2nd', ('1st', '2nd') for k, b in product(keys, bins): - check_value_counts(df, k, b) + binned.append((df, k, b, n, m)) + ids.append("{}-{}-{}".format(k, n, m)) - days = date_range('2015-08-24', periods=10) - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) +@pytest.mark.slow +@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +def test_series_groupby_value_counts(df, keys, bins, n, m): + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + for isort, normalize, sort, ascending, dropna \ + in product((False, True), repeat=5): + + kwargs = dict(normalize=normalize, sort=sort, + ascending=ascending, dropna=dropna, bins=bins) - loop(frame) + gr = df.groupby(keys, sort=isort) + left = gr['3rd'].value_counts(**kwargs) - frame.loc[1::11, '1st'] = np.nan - frame.loc[3::17, '2nd'] = np.nan - frame.loc[7::19, '3rd'] = np.nan - frame.loc[8::19, '3rd'] = np.nan - frame.loc[9::19, '3rd'] = np.nan + gr = df.groupby(keys, sort=isort) + right = gr['3rd'].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ['3rd'] - loop(frame) + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index())