# -*- coding: utf-8 -*-

""" test function application """

import pytest

from string import ascii_lowercase
from pandas import (date_range, Timestamp,
                    Index, MultiIndex, DataFrame, Series)
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.compat import product as cart_product

import numpy as np

import pandas.util.testing as tm
import pandas as pd
from .common import MixIn


# describe
# --------------------------------
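# GroupBy.describe should match the per-group output of Series/DataFrame
# .describe(); the tests below cover Series and DataFrame groupbys, multiple
# grouping keys, tuple-valued keys (which should raise), and axis=1 grouping.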

class TestDescribe(MixIn):

    def test_apply_describe_bug(self):
        grouped = self.mframe.groupby(level='first')
        grouped.describe()  # it works!

    def test_series_describe_multikey(self):
        ts = tm.makeTimeSeries()
        grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.describe()
        assert_series_equal(result['mean'], grouped.mean(), check_names=False)
        assert_series_equal(result['std'], grouped.std(), check_names=False)
        assert_series_equal(result['min'], grouped.min(), check_names=False)

    def test_series_describe_single(self):
        ts = tm.makeTimeSeries()
        grouped = ts.groupby(lambda x: x.month)
        result = grouped.apply(lambda x: x.describe())
        expected = grouped.describe().stack()
        assert_series_equal(result, expected)

    def test_series_index_name(self):
        grouped = self.df.loc[:, ['C']].groupby(self.df['A'])
        result = grouped.agg(lambda x: x.mean())
        assert result.index.name == 'A'

    def test_frame_describe_multikey(self):
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.describe()
        desc_groups = []
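        # Build the expected frame column by column: each per-column
        # describe() result gets a two-level column index of
        # (column name, statistic), and the pieces are concatenated along
        # axis=1 to mirror the unstacked GroupBy.describe output.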
        for col in self.tsframe:
            group = grouped[col].describe()
            group_col = pd.MultiIndex([[col] * len(group.columns),
                                       group.columns],
                                      [[0] * len(group.columns),
                                       range(len(group.columns))])
            group = pd.DataFrame(group.values,
                                 columns=group_col,
                                 index=group.index)
            desc_groups.append(group)
        expected = pd.concat(desc_groups, axis=1)
        tm.assert_frame_equal(result, expected)

        groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
                                         'C': 1, 'D': 1}, axis=1)
        result = groupedT.describe()
        expected = self.tsframe.describe().T
        expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
                                       [range(4), range(len(expected.index))])
        tm.assert_frame_equal(result, expected)

    def test_frame_describe_tupleindex(self):

        # GH 14848 - regression from 0.19.0 to 0.19.1
        df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                         'y': [10, 20, 30, 40, 50] * 3,
                         'z': [100, 200, 300, 400, 500] * 3})
        df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
        df2 = df1.rename(columns={'k': 'key'})
        pytest.raises(ValueError, lambda: df1.groupby('k').describe())
        pytest.raises(ValueError, lambda: df2.groupby('key').describe())

    def test_frame_describe_unstacked_format(self):
        # GH 4792
        prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
                  pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
                  pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
        volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
                   pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
                   pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
        df = pd.DataFrame({'PRICE': prices,
                           'VOLUME': volumes})
        result = df.groupby('PRICE').VOLUME.describe()
        data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
                df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
        expected = pd.DataFrame(data,
                                index=pd.Index([24990, 25499], name='PRICE'),
                                columns=['count', 'mean', 'std', 'min',
                                         '25%', '50%', '75%', 'max'])
        tm.assert_frame_equal(result, expected)


# nunique
# --------------------------------
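# GroupBy.nunique counts distinct values per group; it should agree with
# applying Series.nunique group by group for every combination of sort,
# dropna, and as_index, and in the presence of missing values.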

class TestNUnique(MixIn):

    def test_series_groupby_nunique(self):

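        # Compare the grouped nunique against a per-group apply of
        # Series.nunique for every sort/dropna combination; the two code
        # paths should produce the same counts.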
        def check_nunique(df, keys, as_index=True):
            for sort, dropna in cart_product((False, True), repeat=2):
                gr = df.groupby(keys, as_index=as_index, sort=sort)
                left = gr['julie'].nunique(dropna=dropna)

                gr = df.groupby(keys, as_index=as_index, sort=sort)
                right = gr['julie'].apply(Series.nunique, dropna=dropna)
                if not as_index:
                    right = right.reset_index(drop=True)

                assert_series_equal(left, right, check_names=False)

        days = date_range('2015-08-23', periods=10)

        for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
            frame = DataFrame({
                'jim': np.random.choice(list(ascii_lowercase), n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n)
            })

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])

            frame.loc[1::17, 'jim'] = None
            frame.loc[3::37, 'joe'] = None
            frame.loc[7::19, 'julie'] = None
            frame.loc[8::19, 'julie'] = None
            frame.loc[9::19, 'julie'] = None

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])
            check_nunique(frame, ['jim'], as_index=False)
            check_nunique(frame, ['jim', 'joe'], as_index=False)

    def test_nunique(self):
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        result = df.groupby('A', as_index=False).nunique()
        tm.assert_frame_equal(result, expected)

        # as_index
        expected.index = list('abc')
        expected.index.name = 'A'
        result = df.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

        # with na
        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
        tm.assert_frame_equal(result, expected)

        # dropna
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        result = df.replace({'x': None}).groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

    def test_nunique_with_object(self):
        # GH 11077
        data = pd.DataFrame(
            [[100, 1, 'Alice'],
             [200, 2, 'Bob'],
             [300, 3, 'Charlie'],
             [-400, 4, 'Dan'],
             [500, 5, 'Edith']],
            columns=['amount', 'id', 'name']
        )

        result = data.groupby(['id', 'amount'])['name'].nunique()
        index = MultiIndex.from_arrays([data.id, data.amount])
        expected = pd.Series([1] * 5, name='name', index=index)
        tm.assert_series_equal(result, expected)

    def test_nunique_with_empty_series(self):
        # GH 12553
        data = pd.Series(name='name')
        result = data.groupby(level=0).nunique()
        expected = pd.Series(name='name', dtype='int64')
        tm.assert_series_equal(result, expected)

    def test_nunique_with_timegrouper(self):
        # GH 13453
        test = pd.DataFrame({
            'time': [Timestamp('2016-06-28 09:35:35'),
                     Timestamp('2016-06-28 16:09:30'),
                     Timestamp('2016-06-28 16:46:28')],
            'data': ['1', '2', '3']}).set_index('time')
        result = test.groupby(pd.Grouper(freq='h'))['data'].nunique()
        expected = test.groupby(
            pd.Grouper(freq='h')
        )['data'].apply(pd.Series.nunique)
        tm.assert_series_equal(result, expected)


# count
# --------------------------------
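# GroupBy.count counts non-null values per group (unlike size, which counts
# rows); it should agree with applying DataFrame.count per group across
# dtypes, and should not blow up on object columns whose comparisons raise.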

class TestCount(MixIn):

    def test_groupby_timedelta_cython_count(self):
        df = DataFrame({'g': list('ab' * 2),
                        'delt': np.arange(4).astype('timedelta64[ns]')})
        expected = Series([
            2, 2
        ], index=pd.Index(['a', 'b'], name='g'), name='delt')
        result = df.groupby('g').delt.count()
        tm.assert_series_equal(expected, result)

    def test_count(self):
        n = 1 << 15
        dr = date_range('2015-08-30', periods=n // 10, freq='T')

        df = DataFrame({
            '1st': np.random.choice(list(ascii_lowercase), n),
            '2nd': np.random.randint(0, 5, n),
            '3rd': np.random.randn(n).round(3),
            '4th': np.random.randint(-10, 10, n),
            '5th': np.random.choice(dr, n),
            '6th': np.random.randn(n).round(3),
            '7th': np.random.randn(n).round(3),
            '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
            '9th': np.random.choice(list(ascii_lowercase), n)
        })

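        # Knock out roughly 10% of the values in every column except the
        # grouping keys ('1st', '2nd') and '4th', so count() has nulls to
        # exclude; '9th' is also cast to categorical to cover that dtype.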
        for col in df.columns.drop(['1st', '2nd', '4th']):
            df.loc[np.random.choice(n, n // 10), col] = np.nan

        df['9th'] = df['9th'].astype('category')

        for key in '1st', '2nd', ['1st', '2nd']:
            left = df.groupby(key).count()
            right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
            assert_frame_equal(left, right)

        # GH5610
        # count counts non-nulls
        df = pd.DataFrame([[1, 2, 'foo'],
                           [1, np.nan, 'bar'],
                           [3, np.nan, np.nan]],
                          columns=['A', 'B', 'C'])

        count_as = df.groupby('A').count()
        count_not_as = df.groupby('A', as_index=False).count()

        expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
                             index=[1, 3])
        expected.index.name = 'A'
        assert_frame_equal(count_not_as, expected.reset_index())
        assert_frame_equal(count_as, expected)

        count_B = df.groupby('A')['B'].count()
        assert_series_equal(count_B, expected['B'])

    def test_count_object(self):
        df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3,
                           'c': [2] * 3 + [3] * 3})
        result = df.groupby('c').a.count()
        expected = pd.Series([
            3, 3
        ], index=pd.Index([2, 3], name='c'), name='a')
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
                           'c': [2] * 3 + [3] * 3})
        result = df.groupby('c').a.count()
        expected = pd.Series([
            1, 3
        ], index=pd.Index([2, 3], name='c'), name='a')
        tm.assert_series_equal(result, expected)

    def test_count_cross_type(self):  # GH8169
        vals = np.hstack((np.random.randint(0, 5, (100, 2)),
                          np.random.randint(0, 2, (100, 2))))

        df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
        df[df == 2] = np.nan
        expected = df.groupby(['c', 'd']).count()

        for t in ['float32', 'object']:
            df['a'] = df['a'].astype(t)
            df['b'] = df['b'].astype(t)
            result = df.groupby(['c', 'd']).count()
            tm.assert_frame_equal(result, expected)

    def test_lower_int_prec_count(self):
        df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
                        'b': np.array([1, 2, 3, 6], np.uint32),
                        'c': np.array([4, 5, 6, 8], np.int16),
                        'grp': list('ab' * 2)})
        result = df.groupby('grp').count()
        expected = DataFrame({'a': [2, 2],
                              'b': [2, 2],
                              'c': [2, 2]}, index=pd.Index(list('ab'),
                                                           name='grp'))
        tm.assert_frame_equal(result, expected)

    def test_count_uses_size_on_exception(self):
        class RaisingObjectException(Exception):
            pass

        class RaisingObject(object):

            def __init__(self, msg='I will raise inside Cython'):
                super(RaisingObject, self).__init__()
                self.msg = msg

            def __eq__(self, other):
                # gets called from the Cython code path; raising here
                # triggers the exception handling under test
                raise RaisingObjectException(self.msg)

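        # count() should not propagate the exception; the expected values
        # below match the group sizes (two rows per group), i.e. the
        # size-based fallback named in the test.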
        df = DataFrame({'a': [RaisingObject() for _ in range(4)],
                        'grp': list('ab' * 2)})
        result = df.groupby('grp').count()
        expected = DataFrame({'a': [2, 2]}, index=pd.Index(
            list('ab'), name='grp'))
        tm.assert_frame_equal(result, expected)


# size
# --------------------------------
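# GroupBy.size reports the number of rows in each group, including nulls, so
# it should match len(group) for every group produced by iteration.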

class TestSize(MixIn):

    def test_size(self):
        grouped = self.df.groupby(['A', 'B'])
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('A')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('B')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
            left = df.groupby(key, sort=sort).size()
            right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
            assert_series_equal(left, right, check_names=False)

        # GH11699
        df = DataFrame([], columns=['A', 'B'])
        out = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(df.groupby('A').size(), out)