diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py new file mode 100644 index 0000000000000..6b162b71f79de --- /dev/null +++ b/pandas/tests/groupby/test_aggregate.py @@ -0,0 +1,494 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose +from datetime import datetime + + +from pandas import date_range +from pandas.core.index import MultiIndex +from pandas.core.api import DataFrame + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) + +from pandas.core.groupby import (SpecificationError) +from pandas.compat import (lmap, OrderedDict) +from pandas.formats.printing import pprint_thing + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByAggregate(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_agg_api(self): + + # GH 6337 + # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # different api for agg when passed custom function with mixed frame + + df = DataFrame({'data1': np.random.randn(5), + 'data2': np.random.randn(5), + 'key1': ['a', 'a', 'b', 'b', 'a'], + 'key2': ['one', 'two', 'one', 'two', 'one']}) + grouped = df.groupby('key1') + + def peak_to_peak(arr): + return arr.max() - arr.min() + + expected = grouped.agg([peak_to_peak]) + expected.columns = ['data1', 'data2'] + result = grouped.agg(peak_to_peak) + assert_frame_equal(result, expected) + + def test_agg_regression1(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_agg_datetimes_mixed(self): + data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] + + df1 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] + else None, row[2]] for row in data] + + df2 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + df1['weights'] = df1['value'] / df1['value'].sum() + gb1 = df1.groupby('date').aggregate(np.sum) + + df2['weights'] = df1['value'] / df1['value'].sum() + gb2 = df2.groupby('date').aggregate(np.sum) + + assert (len(gb1) == len(gb2)) + + def test_agg_period_index(self): + from pandas import period_range, PeriodIndex + prng = period_range('2012-1-1', freq='M', periods=3) + df = DataFrame(np.random.randn(3, 2), index=prng) + rs = df.groupby(level=0).sum() + tm.assertIsInstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start='1999-01', periods=5, freq='M') + s1 = Series(np.random.rand(len(index)), index=index) + s2 = Series(np.random.rand(len(index)), index=index) + series = [('s1', s1), ('s2', s2)] + df = DataFrame.from_items(series) + grouped = df.groupby(df.index.month) + list(grouped) + + def test_agg_dict_parameter_cast_result_dtypes(self): + # GH 12821 + + df = DataFrame( + {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) + df.loc[[0, 1, 2, 5], 'time'] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.first(), exp) + assert_frame_equal(grouped.agg('first'), exp) + assert_frame_equal(grouped.agg({'time': 'first'}), exp) + assert_series_equal(grouped.time.first(), exp['time']) + assert_series_equal(grouped.time.agg('first'), exp['time']) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.last(), exp) + assert_frame_equal(grouped.agg('last'), exp) + assert_frame_equal(grouped.agg({'time': 'last'}), exp) + assert_series_equal(grouped.time.last(), exp['time']) + assert_series_equal(grouped.time.agg('last'), exp['time']) + + def test_agg_must_agg(self): + grouped = self.df.groupby('A')['C'] + self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) + self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) + + def test_agg_ser_multi_key(self): + # TODO(wesm): unused + ser = self.df.C # noqa + + f = lambda x: x.sum() + results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) + expected = self.df.groupby(['A', 'B']).sum()['C'] + assert_series_equal(results, expected) + + def test_agg_apply_corner(self): + # nothing to group, all NA + grouped = self.ts.groupby(self.ts * np.nan) + self.assertEqual(self.ts.dtype, np.float64) + + # groupby float64 values results in Float64Index + exp = Series([], dtype=np.float64, index=pd.Index( + [], dtype=np.float64)) + assert_series_equal(grouped.sum(), exp) + assert_series_equal(grouped.agg(np.sum), exp) + assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + + # DataFrame + grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) + exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, + index=pd.Index([], dtype=np.float64)) + assert_frame_equal(grouped.sum(), exp_df, check_names=False) + assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + check_names=False) + + def test_agg_grouping_is_list_tuple(self): + from pandas.core.groupby import Grouping + + df = tm.makeTimeDataFrame() + + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_aggregate_api_consistency(self): + # GH 9052 + # make sure that the aggregates via dict + # are consistent + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + grouped = df.groupby(['A', 'B']) + c_mean = grouped['C'].mean() + c_sum = grouped['C'].sum() + d_mean = grouped['D'].mean() + d_sum = grouped['D'].sum() + + result = grouped['D'].agg(['sum', 'mean']) + expected = pd.concat([d_sum, d_mean], + axis=1) + expected.columns = ['sum', 'mean'] + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg([np.sum, np.mean]) + expected = pd.concat([c_sum, + c_mean, + d_sum, + d_mean], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped[['D', 'C']].agg([np.sum, np.mean]) + expected = pd.concat([d_sum, + d_mean, + c_sum, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['D', 'C'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': 'mean', 'D': 'sum'}) + expected = pd.concat([d_sum, + c_mean], + axis=1) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': ['mean', 'sum'], + 'D': ['mean', 'sum']}) + expected = pd.concat([c_mean, + c_sum, + d_mean, + d_sum], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['mean', 'sum']]) + + result = grouped[['D', 'C']].agg({'r': np.sum, + 'r2': np.mean}) + expected = pd.concat([d_sum, + c_sum, + d_mean, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['r', 'r2'], + ['D', 'C']]) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_compat(self): + + # GH 12334 + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = MultiIndex.from_tuples([('C', 'sum'), + ('C', 'std')]) + result = g['D'].agg({'C': ['sum', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = ['C', 'D'] + result = g['D'].agg({'C': 'sum', 'D': 'std'}) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_nested_dicts(self): + + # API change for disallowing these types of nested dicts + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + def f(): + g.aggregate({'r1': {'C': ['mean', 'sum']}, + 'r2': {'D': ['mean', 'sum']}}) + + self.assertRaises(SpecificationError, f) + + result = g.agg({'C': {'ra': ['mean', 'std']}, + 'D': {'rb': ['mean', 'std']}}) + expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), + g['D'].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( + 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + assert_frame_equal(result, expected, check_like=True) + + # same name as the original column + # GH9052 + expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) + expected = expected.rename(columns={'result1': 'D'}) + result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_python_multiindex(self): + grouped = self.mframe.groupby(['A', 'B']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_aggregate_str_func(self): + def _check_results(grouped): + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], + ['C', 'mean'], ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var( + )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + assert_frame_equal(result, expected) + + by_weekday = self.tsframe.groupby(lambda x: x.weekday()) + _check_results(by_weekday) + + by_mwkday = self.tsframe.groupby([lambda x: x.month, + lambda x: x.weekday()]) + _check_results(by_mwkday) + + def test_aggregate_item_by_item(self): + + df = self.df.copy() + df['E'] = ['a'] * len(self.df) + grouped = self.df.groupby('A') + + # API change in 0.11 + # def aggfun(ser): + # return len(ser + 'a') + # result = grouped.agg(aggfun) + # self.assertEqual(len(result.columns), 1) + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (self.df.A == 'foo').sum() + bar = (self.df.A == 'bar').sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + exp = pd.Series(np.array([foo] * K), index=list('BCD'), + dtype=np.float64, name='foo') + tm.assert_series_equal(result.xs('foo'), exp) + + exp = pd.Series(np.array([bar] * K), index=list('BCD'), + dtype=np.float64, name='bar') + tm.assert_almost_equal(result.xs('bar'), exp) + + def aggfun(ser): + return ser.size + + result = DataFrame().groupby(self.df.A).agg(aggfun) + tm.assertIsInstance(result, DataFrame) + self.assertEqual(len(result), 0) + + def test_agg_item_by_item_raise_typeerror(self): + from numpy.random import randint + + df = DataFrame(randint(10, size=(20, 10))) + + def raiseException(df): + pprint_thing('----------------------------------------') + pprint_thing(df.to_string()) + raise TypeError + + self.assertRaises(TypeError, df.groupby(0).agg, raiseException) + + def test_series_agg_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + + def test_series_agg_multi_pure_python(self): + data = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def bad(x): + assert (len(x.base) > 0) + return 'foo' + + result = data.groupby(['A', 'B']).agg(bad) + expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + assert_frame_equal(result, expected) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py new file mode 100644 index 0000000000000..d3c7bc2adbb4a --- /dev/null +++ b/pandas/tests/groupby/test_categorical.py @@ -0,0 +1,467 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose +from numpy import nan + + +from pandas.core.index import Index, MultiIndex, CategoricalIndex +from pandas.core.api import DataFrame, Categorical + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) + +from pandas.compat import (lmap) + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByCategorical(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_apply_use_categorical_name(self): + from pandas import qcut + cats = qcut(self.df.C, 4) + + def get_stats(group): + return {'min': group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = self.df.groupby(cats).D.apply(get_stats) + self.assertEqual(result.index.names[0], 'C') + + def test_apply_categorical_data(self): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + + def test_groupby_categorical(self): + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() + expected.index.names = [None, None] + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + self.assert_index_equal(desc_result.index.get_level_values(0), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + self.assert_index_equal(desc_result.index.get_level_values(1), exp) + + def test_groupby_datetime_categorical(self): + # GH9049: ensure backward compatibility + levels = pd.date_range('2014-01-01', periods=4) + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index = CategoricalIndex(expected.index, + categories=expected.index, + ordered=True) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = cats.take_nd(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels).describe() + expected.index.names = [None, None] + assert_frame_equal(desc_result, expected) + tm.assert_index_equal(desc_result.index, expected.index) + tm.assert_index_equal( + desc_result.index.get_level_values(0), + expected.index.get_level_values(0)) + + # GH 10460 + expc = Categorical.from_codes( + np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) + self.assert_index_equal(desc_result.index.get_level_values(0), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + self.assert_index_equal(desc_result.index.get_level_values(1), exp) + + def test_groupby_categorical_index(self): + + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, ordered=True) + df = DataFrame( + np.repeat( + np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + def test_groupby_describe_categorical_columns(self): + # GH 11558 + cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) + df = DataFrame(np.random.randn(20, 4), columns=cats) + result = df.groupby([1, 2, 3, 4] * 5).describe() + + tm.assert_index_equal(result.columns, cats) + tm.assert_categorical_equal(result.columns.values, cats.values) + + def test_groupby_unstack_categorical(self): + # GH11558 (example is taken from the original issue) + df = pd.DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) + df['medium'] = df['medium'].astype('category') + + gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + result = gcat.describe() + + exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, + name='medium') + tm.assert_index_equal(result.columns, exp_columns) + tm.assert_categorical_equal(result.columns.values, exp_columns.values) + + result = gcat['A'] + gcat['B'] + expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + tm.assert_series_equal(result, expected) + + def test_groupby_categorical_unequal_len(self): + # GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + # The raises only happens with categorical, not with series of types + # category + bins = pd.cut(series.dropna().values, 4) + + # len(bins) != len(series) here + self.assertRaises(ValueError, lambda: series.groupby(bins).mean()) + + def test_groupby_categorical_two_columns(self): + + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) + exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, + index=exp_index) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat", "ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), + "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" + ]) + tm.assert_frame_equal(res, exp) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values, 'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, + nan, nan, nan, nan, 4, 5], + "C3": [nan, nan, nan, nan, 10, 100, + nan, nan, nan, nan, 200, 34]}, index=idx) + tm.assert_frame_equal(res, exp) + + def test_groupby_multi_categorical_as_index(self): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') + result = df.groupby(['cat', s], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # is original index dropped? + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + result = df.groupby(['cat', 'A'], as_index=False).sum() + tm.assert_frame_equal(result, expected, check_index_type=True) + + def test_groupby_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), + ordered=True)}) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + def test_groupby_categorical_no_compress(self): + data = Series(np.random.randn(9)) + + codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + assert_series_equal(result, exp) + + codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) + cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean().reindex(cats.categories) + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + assert_series_equal(result, exp) + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + result = data.groupby("b").mean() + result = result["a"].values + exp = np.array([1, 2, 4, np.nan]) + self.assert_numpy_array_equal(result, exp) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py new file mode 100644 index 0000000000000..81bf977e924d8 --- /dev/null +++ b/pandas/tests/groupby/test_filters.py @@ -0,0 +1,635 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose + +from numpy import nan + + +from pandas import Timestamp +from pandas.core.index import MultiIndex +from pandas.core.api import DataFrame + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) +from pandas.compat import (lmap) + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByFilter(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_filter_series(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + def test_filter_single_column_df(self): + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + def test_filter_multi_column_df(self): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), + expected) + + def test_filter_mixed_df(self): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + def test_filter_out_all_groups(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) + + def test_filter_out_no_groups(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + assert_frame_equal(filtered, df) + + def test_filter_out_all_groups_in_df(self): + # GH12768 + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) + expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) + assert_frame_equal(expected, res) + + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) + expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + assert_frame_equal(expected, res) + + def test_filter_condition_raises(self): + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + + s = pd.Series([-1, 0, 1, 2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + self.assertRaises(TypeError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + def test_filter_with_axis_in_groupby(self): + # issue 11041 + index = pd.MultiIndex.from_product([range(10), [0, 1]]) + data = pd.DataFrame( + np.arange(100).reshape(-1, 20), columns=index, dtype='int64') + result = data.groupby(level=0, + axis=1).filter(lambda x: x.iloc[0, 0] > 10) + expected = data.iloc[:, 12:20] + assert_frame_equal(result, expected) + + def test_filter_bad_shapes(self): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby('B') + g_s = s.groupby(s) + + f = lambda x: x + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: x == 1 + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: np.outer(x, x) + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + def test_filter_nan_is_false(self): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby(df['B']) + g_s = s.groupby(s) + + f = lambda x: np.nan + assert_frame_equal(g_df.filter(f), df.loc[[]]) + assert_series_equal(g_s.filter(f), s[[]]) + + def test_filter_against_workaround(self): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0, 100, 1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Series of floats + s = 100 * Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters. + transform(lambda x: len(x) < N / 10).astype('bool')] + new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + assert_frame_equal(new_way, old_way) + + def test_filter_using_len(self): + # BUG GH4447 + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + grouped = df.groupby('B') + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame( + {'A': np.arange(2, 6), + 'B': list('bbbb'), + 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.ix[[]] + assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. + s = df['B'] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + assert_series_equal(actual, expected) + + def test_filter_maintains_ordering(self): + # Simple case: index is sequential. #4621 + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + def test_filter_multiple_timestamp(self): + # GH 10114 + df = DataFrame({'A': np.arange(5, dtype='int64'), + 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], + 'C': Timestamp('20130101')}) + + grouped = df.groupby(['B', 'C']) + + result = grouped['A'].filter(lambda x: True) + assert_series_equal(df['A'], result) + + result = grouped['A'].transform(len) + expected = Series([2, 3, 2, 3, 3], name='A') + assert_series_equal(result, expected) + + result = grouped.filter(lambda x: True) + assert_frame_equal(df, result) + + result = grouped.transform('sum') + expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + assert_frame_equal(result, expected) + + result = grouped.transform(len) + expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + assert_frame_equal(result, expected) + + def test_filter_and_transform_with_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_multiple_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_float_index(self): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_timestamp_index(self): + # GH4620 + t0 = Timestamp('2013-09-30 00:05:00') + t1 = Timestamp('2013-10-30 00:05:00') + t2 = Timestamp('2013-11-30 00:05:00') + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_string_index(self): + # GH4620 + index = list('bbbcbbab') + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_has_access_to_grouped_cols(self): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? + filt = g.filter(lambda x: x['A'].sum() == 2) + assert_frame_equal(filt, df.iloc[[0, 1]]) + + def test_filter_enforces_scalarness(self): + df = pd.DataFrame([ + ['best', 'a', 'x'], + ['worst', 'b', 'y'], + ['best', 'c', 'x'], + ['best', 'd', 'y'], + ['worst', 'd', 'y'], + ['worst', 'd', 'y'], + ['best', 'd', 'z'], + ], columns=['a', 'b', 'c']) + with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): + df.groupby('c').filter(lambda g: g['a'] == 'best') + + def test_filter_non_bool_raises(self): + df = pd.DataFrame([ + ['best', 'a', 1], + ['worst', 'b', 1], + ['best', 'c', 1], + ['best', 'd', 1], + ['worst', 'd', 1], + ['worst', 'd', 1], + ['best', 'd', 1], + ], columns=['a', 'b', 'c']) + with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): + df.groupby('a').filter(lambda g: g.c.mean()) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/groupby/test_groupby.py similarity index 81% rename from pandas/tests/test_groupby.py rename to pandas/tests/groupby/test_groupby.py index 7b98a45395752..97e1f7dc94866 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -14,7 +14,6 @@ _lexsort_indexer) from pandas.core.series import Series from pandas.core.config import option_context -from pandas.formats.printing import pprint_thing from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, assert_index_equal, assertRaisesRegexp) @@ -864,110 +863,6 @@ def f(grp): e.name = None assert_series_equal(result, e) - def test_agg_api(self): - - # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error - # different api for agg when passed custom function with mixed frame - - df = DataFrame({'data1': np.random.randn(5), - 'data2': np.random.randn(5), - 'key1': ['a', 'a', 'b', 'b', 'a'], - 'key2': ['one', 'two', 'one', 'two', 'one']}) - grouped = df.groupby('key1') - - def peak_to_peak(arr): - return arr.max() - arr.min() - - expected = grouped.agg([peak_to_peak]) - expected.columns = ['data1', 'data2'] - result = grouped.agg(peak_to_peak) - assert_frame_equal(result, expected) - - def test_agg_regression1(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - def test_agg_datetimes_mixed(self): - data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] - - df1 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] - else None, row[2]] for row in data] - - df2 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - df1['weights'] = df1['value'] / df1['value'].sum() - gb1 = df1.groupby('date').aggregate(np.sum) - - df2['weights'] = df1['value'] / df1['value'].sum() - gb2 = df2.groupby('date').aggregate(np.sum) - - assert (len(gb1) == len(gb2)) - - def test_agg_period_index(self): - from pandas import period_range, PeriodIndex - prng = period_range('2012-1-1', freq='M', periods=3) - df = DataFrame(np.random.randn(3, 2), index=prng) - rs = df.groupby(level=0).sum() - tm.assertIsInstance(rs.index, PeriodIndex) - - # GH 3579 - index = period_range(start='1999-01', periods=5, freq='M') - s1 = Series(np.random.rand(len(index)), index=index) - s2 = Series(np.random.rand(len(index)), index=index) - series = [('s1', s1), ('s2', s2)] - df = DataFrame.from_items(series) - grouped = df.groupby(df.index.month) - list(grouped) - - def test_agg_dict_parameter_cast_result_dtypes(self): - # GH 12821 - - df = DataFrame( - {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) - df.loc[[0, 1, 2, 5], 'time'] = None - - # test for `first` function - exp = df.loc[[0, 3, 4, 6]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.first(), exp) - assert_frame_equal(grouped.agg('first'), exp) - assert_frame_equal(grouped.agg({'time': 'first'}), exp) - assert_series_equal(grouped.time.first(), exp['time']) - assert_series_equal(grouped.time.agg('first'), exp['time']) - - # test for `last` function - exp = df.loc[[0, 3, 4, 7]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.last(), exp) - assert_frame_equal(grouped.agg('last'), exp) - assert_frame_equal(grouped.agg({'time': 'last'}), exp) - assert_series_equal(grouped.time.last(), exp['time']) - assert_series_equal(grouped.time.agg('last'), exp['time']) - - def test_agg_must_agg(self): - grouped = self.df.groupby('A')['C'] - self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) - self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) - - def test_agg_ser_multi_key(self): - # TODO(wesm): unused - ser = self.df.C # noqa - - f = lambda x: x.sum() - results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) - expected = self.df.groupby(['A', 'B']).sum()['C'] - assert_series_equal(results, expected) - def test_get_group(self): wp = tm.makePanel() grouped = wp.groupby(lambda x: x.month, axis='major') @@ -1034,58 +929,11 @@ def test_get_group_grouped_by_tuple(self): expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) assert_frame_equal(result, expected) - def test_agg_apply_corner(self): - # nothing to group, all NA - grouped = self.ts.groupby(self.ts * np.nan) - self.assertEqual(self.ts.dtype, np.float64) - - # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, index=pd.Index( - [], dtype=np.float64)) - assert_series_equal(grouped.sum(), exp) - assert_series_equal(grouped.agg(np.sum), exp) - assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) - - # DataFrame - grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, - index=pd.Index([], dtype=np.float64)) - assert_frame_equal(grouped.sum(), exp_df, check_names=False) - assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) - - def test_agg_grouping_is_list_tuple(self): - from pandas.core.groupby import Grouping - - df = tm.makeTimeDataFrame() - - grouped = df.groupby(lambda x: x.year) - grouper = grouped.grouper.groupings[0].grouper - grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - def test_grouping_error_on_multidim_input(self): from pandas.core.groupby import Grouping self.assertRaises(ValueError, Grouping, self.df.index, self.df[['A', 'A']]) - def test_agg_python_multiindex(self): - grouped = self.mframe.groupby(['A', 'B']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - def test_apply_describe_bug(self): grouped = self.mframe.groupby(level='first') grouped.describe() # it works! @@ -1185,80 +1033,6 @@ def test_groups(self): self.assertTrue((self.df.ix[v]['A'] == k[0]).all()) self.assertTrue((self.df.ix[v]['B'] == k[1]).all()) - def test_aggregate_str_func(self): - def _check_results(grouped): - # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() - assert_series_equal(result, expected) - - # group frame by function name - result = grouped.aggregate('var') - expected = grouped.var() - assert_frame_equal(result, expected) - - # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], - ['C', 'mean'], ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var( - )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) - assert_frame_equal(result, expected) - - by_weekday = self.tsframe.groupby(lambda x: x.weekday()) - _check_results(by_weekday) - - by_mwkday = self.tsframe.groupby([lambda x: x.month, - lambda x: x.weekday()]) - _check_results(by_mwkday) - - def test_aggregate_item_by_item(self): - - df = self.df.copy() - df['E'] = ['a'] * len(self.df) - grouped = self.df.groupby('A') - - # API change in 0.11 - # def aggfun(ser): - # return len(ser + 'a') - # result = grouped.agg(aggfun) - # self.assertEqual(len(result.columns), 1) - - aggfun = lambda ser: ser.size - result = grouped.agg(aggfun) - foo = (self.df.A == 'foo').sum() - bar = (self.df.A == 'bar').sum() - K = len(result.columns) - - # GH5782 - # odd comparisons can result here, so cast to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) - - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) - - def aggfun(ser): - return ser.size - - result = DataFrame().groupby(self.df.A).agg(aggfun) - tm.assertIsInstance(result, DataFrame) - self.assertEqual(len(result), 0) - - def test_agg_item_by_item_raise_typeerror(self): - from numpy.random import randint - - df = DataFrame(randint(10, size=(20, 10))) - - def raiseException(df): - pprint_thing('----------------------------------------') - pprint_thing(df.to_string()) - raise TypeError - - self.assertRaises(TypeError, df.groupby(0).agg, raiseException) - def test_basic_regression(self): # regression T = [1.0 * x for x in lrange(1, 10) * 10][:1095] @@ -1687,34 +1461,6 @@ def test_series_describe_single(self): expected = grouped.describe() assert_series_equal(result, expected) - def test_series_agg_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - - result = grouped.agg(np.sum) - expected = grouped.sum() - assert_series_equal(result, expected) - - def test_series_agg_multi_pure_python(self): - data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def bad(x): - assert (len(x.base) > 0) - return 'foo' - - result = data.groupby(['A', 'B']).agg(bad) - expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') - assert_frame_equal(result, expected) - def test_series_index_name(self): grouped = self.df.ix[:, ['C']].groupby(self.df['A']) result = grouped.agg(lambda x: x.mean()) @@ -1828,138 +1574,6 @@ def test_frame_set_name_single(self): result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) self.assertEqual(result.index.name, 'A') - def test_aggregate_api_consistency(self): - # GH 9052 - # make sure that the aggregates via dict - # are consistent - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - grouped = df.groupby(['A', 'B']) - c_mean = grouped['C'].mean() - c_sum = grouped['C'].sum() - d_mean = grouped['D'].mean() - d_sum = grouped['D'].sum() - - result = grouped['D'].agg(['sum', 'mean']) - expected = pd.concat([d_sum, d_mean], - axis=1) - expected.columns = ['sum', 'mean'] - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, - c_mean, - d_sum, - d_mean], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped[['D', 'C']].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, - d_mean, - c_sum, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': 'mean', 'D': 'sum'}) - expected = pd.concat([d_sum, - c_mean], - axis=1) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': ['mean', 'sum'], - 'D': ['mean', 'sum']}) - expected = pd.concat([c_mean, - c_sum, - d_mean, - d_sum], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['mean', 'sum']]) - - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) - expected = pd.concat([d_sum, - c_sum, - d_mean, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['r', 'r2'], - ['D', 'C']]) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_compat(self): - - # GH 12334 - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = MultiIndex.from_tuples([('C', 'sum'), - ('C', 'std')]) - result = g['D'].agg({'C': ['sum', 'std']}) - assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = ['C', 'D'] - result = g['D'].agg({'C': 'sum', 'D': 'std'}) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_nested_dicts(self): - - # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - def f(): - g.aggregate({'r1': {'C': ['mean', 'sum']}, - 'r2': {'D': ['mean', 'sum']}}) - - self.assertRaises(SpecificationError, f) - - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) - expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), - g['D'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) - - # same name as the original column - # GH9052 - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) - expected = expected.rename(columns={'result1': 'D'}) - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) - assert_frame_equal(result, expected, check_like=True) - def test_multi_iter(self): s = Series(np.arange(6)) k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) @@ -3351,51 +2965,6 @@ def filt2(x): result = data.groupby('id_field').apply(filt2) assert_frame_equal(result, expected) - def test_apply_use_categorical_name(self): - from pandas import qcut - cats = qcut(self.df.C, 4) - - def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} - - result = self.df.groupby(cats).D.apply(get_stats) - self.assertEqual(result.index.names[0], 'C') - - def test_apply_categorical_data(self): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - def test_apply_corner_cases(self): # #535, can't use sliding iterator @@ -4342,142 +3911,6 @@ def test_groupby_sort_multiindex_series(self): result = mseries.groupby(level=['a', 'b'], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) - def test_groupby_categorical(self): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) - - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, sort=False).describe() - expected.index.names = [None, None] - assert_frame_equal(desc_result, expected) - - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) - - def test_groupby_datetime_categorical(self): - # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) - ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() - expected.index.names = [None, None] - assert_frame_equal(desc_result, expected) - tm.assert_index_equal(desc_result.index, expected.index) - tm.assert_index_equal( - desc_result.index.get_level_values(0), - expected.index.get_level_values(0)) - - # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) - exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) - - def test_groupby_categorical_index(self): - - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats - - # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - # with a cat column, should produce a cat index - result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - def test_groupby_describe_categorical_columns(self): - # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) - df = DataFrame(np.random.randn(20, 4), columns=cats) - result = df.groupby([1, 2, 3, 4] * 5).describe() - - tm.assert_index_equal(result.columns, cats) - tm.assert_categorical_equal(result.columns.values, cats.values) - - def test_groupby_unstack_categorical(self): - # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') - - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() - result = gcat.describe() - - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') - tm.assert_index_equal(result.columns, exp_columns) - tm.assert_categorical_equal(result.columns.values, exp_columns.values) - - result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) - tm.assert_series_equal(result, expected) - def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex @@ -4695,37 +4128,6 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_categorical_no_compress(self): - data = Series(np.random.randn(9)) - - codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() - - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - result = data.groupby("b").mean() - result = result["a"].values - exp = np.array([1, 2, 4, np.nan]) - self.assert_numpy_array_equal(result, exp) - def test_groupby_non_arithmetic_agg_types(self): # GH9311, GH6620 df = pd.DataFrame([{'a': 1, @@ -4837,16 +4239,6 @@ def test_groupby_datetime64_32_bit(self): expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') assert_series_equal(result, expected) - def test_groupby_categorical_unequal_len(self): - # GH3011 - series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - # The raises only happens with categorical, not with series of types - # category - bins = pd.cut(series.dropna().values, 4) - - # len(bins) != len(series) here - self.assertRaises(ValueError, lambda: series.groupby(bins).mean()) - def test_groupby_multiindex_missing_pair(self): # GH9049 df = DataFrame({'group1': ['a', 'a', 'a', 'b'], @@ -5444,534 +4836,6 @@ def test_cumcount_groupby_not_col(self): assert_series_equal(expected, g.cumcount()) assert_series_equal(expected, sg.cumcount()) - def test_filter_series(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. - assert_series_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) - - def test_filter_single_column_df(self): - df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) - grouper = df[0].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) - - def test_filter_multi_column_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) - - def test_filter_mixed_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) - - def test_filter_out_all_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) - - def test_filter_out_no_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - filtered = grouped.filter(lambda x: x.mean() > 0) - assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) - assert_frame_equal(filtered, df) - - def test_filter_out_all_groups_in_df(self): - # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) - assert_frame_equal(expected, res) - - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") - assert_frame_equal(expected, res) - - def test_filter_condition_raises(self): - def raise_if_sum_is_zero(x): - if x.sum() == 0: - raise ValueError - else: - return x.sum() > 0 - - s = pd.Series([-1, 0, 1, 2]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - self.assertRaises(TypeError, - lambda: grouped.filter(raise_if_sum_is_zero)) - - def test_filter_with_axis_in_groupby(self): - # issue 11041 - index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) - expected = data.iloc[:, 12:20] - assert_frame_equal(result, expected) - - def test_filter_bad_shapes(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') - g_s = s.groupby(s) - - f = lambda x: x - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: x == 1 - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: np.outer(x, x) - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - def test_filter_nan_is_false(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) - g_s = s.groupby(s) - - f = lambda x: np.nan - assert_frame_equal(g_df.filter(f), df.loc[[]]) - assert_series_equal(g_s.filter(f), s[[]]) - - def test_filter_against_workaround(self): - np.random.seed(0) - # Series of ints - s = Series(np.random.randint(0, 100, 1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Series of floats - s = 100 * Series(np.random.random(1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Set up DataFrame of ints, floats, strings. - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 1000 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - - # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - # Group by floats (rounded); filter on strings. - grouper = df.floats.apply(lambda x: np.round(x, -1)) - grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) - assert_frame_equal(new_way, old_way) - - # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - def test_filter_using_len(self): - # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') - actual = grouped.filter(lambda x: len(x) > 2) - expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) - assert_frame_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = df.ix[[]] - assert_frame_equal(actual, expected) - - # Series have always worked properly, but we'll test anyway. - s = df['B'] - grouped = s.groupby(s) - actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') - assert_series_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = s[[]] - assert_series_equal(actual, expected) - - def test_filter_maintains_ordering(self): - # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - def test_filter_multiple_timestamp(self): - # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) - - grouped = df.groupby(['B', 'C']) - - result = grouped['A'].filter(lambda x: True) - assert_series_equal(df['A'], result) - - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') - assert_series_equal(result, expected) - - result = grouped.filter(lambda x: True) - assert_frame_equal(df, result) - - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) - assert_frame_equal(result, expected) - - result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) - assert_frame_equal(result, expected) - - def test_filter_and_transform_with_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_multiple_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_float_index(self): - # GH4620 - index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_timestamp_index(self): - # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') - index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_string_index(self): - # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_has_access_to_grouped_cols(self): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # previously didn't have access to col A #???? - filt = g.filter(lambda x: x['A'].sum() == 2) - assert_frame_equal(filt, df.iloc[[0, 1]]) - - def test_filter_enforces_scalarness(self): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') - - def test_filter_non_bool_raises(self): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) - def test_fill_constistency(self): # GH9221 @@ -6687,145 +5551,6 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - def test_groupby_categorical_two_columns(self): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) - - def test_groupby_multi_categorical_as_index(self): - # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # another not in-axis grouper (conflicting names in index) - s = Series(['a', 'b', 'b'], name='cat') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # is original index dropped? - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - - for name in [None, 'X', 'B', 'cat']: - df.index = Index(list("abc"), name=name) - result = df.groupby(['cat', 'A'], as_index=False).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) - - def test_groupby_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) - # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684.