Skip to content

Commit 231d441

Browse files
committed
TST: groupby.describe levels don't appear as column (#4792)
Restructure describe def; fix another test; refactoring tests; linting & patch groupby tests; add whatsnew; fix docstring
1 parent c26e5bb commit 231d441

File tree

3 files changed

+75
-23
lines changed

3 files changed

+75
-23
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ Other API Changes
366366
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
367367
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
368368
- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
369+
- ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index (:issue:`4792`)
369370
.. _whatsnew_0200.deprecations:
370371

371372
Deprecations

pandas/core/groupby.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@
8080
'mean', 'sum', 'min', 'max',
8181
'cumcount',
8282
'resample',
83-
'describe',
8483
'rank', 'quantile',
8584
'fillna',
8685
'mad',
@@ -1138,6 +1137,17 @@ def ohlc(self):
11381137
return self._apply_to_column_groupbys(
11391138
lambda x: x._cython_agg_general('ohlc'))
11401139

1140+
@Substitution(name='groupby')
1141+
@Appender(_doc_template)
1142+
def describe(self, **kwargs):
1143+
"""
1144+
Provide summary statistics for each group, excluding NaN values
1145+
"""
1146+
result = self.apply(lambda x: x.describe(**kwargs))
1147+
if self.axis == 1:
1148+
return result.T
1149+
return result.unstack()
1150+
11411151
@Substitution(name='groupby')
11421152
@Appender(_doc_template)
11431153
def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3049,13 @@ def nlargest(self, n=5, keep='first'):
30393049
def nsmallest(self, n=5, keep='first'):
30403050
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
30413051

3052+
@Appender(Series.describe.__doc__)
3053+
def describe(self, **kwargs):
3054+
result = self.apply(lambda x: x.describe(**kwargs))
3055+
if self.axis == 1:
3056+
return result.T
3057+
return result.unstack()
3058+
30423059
def value_counts(self, normalize=False, sort=True, ascending=False,
30433060
bins=None, dropna=True):
30443061

pandas/tests/groupby/test_groupby.py

+56-22
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ def test_attr_wrapper(self):
14471447
for name, gp in grouped:
14481448
expected[name] = gp.describe()
14491449
expected = DataFrame(expected).T
1450-
assert_frame_equal(result.unstack(), expected)
1450+
assert_frame_equal(result, expected)
14511451

14521452
# get attribute
14531453
result = grouped.dtype
@@ -1459,7 +1459,7 @@ def test_attr_wrapper(self):
14591459
def test_series_describe_multikey(self):
14601460
ts = tm.makeTimeSeries()
14611461
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
1462-
result = grouped.describe().unstack()
1462+
result = grouped.describe()
14631463
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
14641464
assert_series_equal(result['std'], grouped.std(), check_names=False)
14651465
assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1468,7 +1468,7 @@ def test_series_describe_single(self):
14681468
ts = tm.makeTimeSeries()
14691469
grouped = ts.groupby(lambda x: x.month)
14701470
result = grouped.apply(lambda x: x.describe())
1471-
expected = grouped.describe()
1471+
expected = grouped.describe().stack()
14721472
assert_series_equal(result, expected)
14731473

14741474
def test_series_index_name(self):
@@ -1479,17 +1479,27 @@ def test_series_index_name(self):
14791479
def test_frame_describe_multikey(self):
14801480
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
14811481
result = grouped.describe()
1482-
1482+
desc_groups = []
14831483
for col in self.tsframe:
1484-
expected = grouped[col].describe()
1485-
assert_series_equal(result[col], expected, check_names=False)
1484+
group = grouped[col].describe()
1485+
group_col = pd.MultiIndex([[col] * len(group.columns),
1486+
group.columns],
1487+
[[0] * len(group.columns),
1488+
range(len(group.columns))])
1489+
group = pd.DataFrame(group.values,
1490+
columns=group_col,
1491+
index=group.index)
1492+
desc_groups.append(group)
1493+
expected = pd.concat(desc_groups, axis=1)
1494+
tm.assert_frame_equal(result, expected)
14861495

14871496
groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
14881497
'C': 1, 'D': 1}, axis=1)
14891498
result = groupedT.describe()
1490-
1491-
for name, group in groupedT:
1492-
assert_frame_equal(result[name], group.describe())
1499+
expected = self.tsframe.describe().T
1500+
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
1501+
[range(4), range(len(expected.index))])
1502+
tm.assert_frame_equal(result, expected)
14931503

14941504
def test_frame_describe_tupleindex(self):
14951505

@@ -1499,10 +1509,27 @@ def test_frame_describe_tupleindex(self):
14991509
'z': [100, 200, 300, 400, 500] * 3})
15001510
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
15011511
df2 = df1.rename(columns={'k': 'key'})
1502-
result = df1.groupby('k').describe()
1503-
expected = df2.groupby('key').describe()
1504-
expected.index.set_names(result.index.names, inplace=True)
1505-
assert_frame_equal(result, expected)
1512+
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
1513+
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
1514+
1515+
def test_frame_describe_multiindex_level_not_as_column(self):
1516+
# GH 4792
1517+
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
1518+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
1519+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
1520+
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
1521+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
1522+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
1523+
df = pd.DataFrame({'PRICE': prices,
1524+
'VOLUME': volumes})
1525+
result = df.groupby('PRICE').VOLUME.describe()
1526+
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
1527+
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
1528+
expected = pd.DataFrame(data,
1529+
index=pd.Index([24990, 25499], name='PRICE'),
1530+
columns=['count', 'mean', 'std', 'min',
1531+
'25%', '50%', '75%', 'max'])
1532+
tm.assert_frame_equal(result, expected)
15061533

15071534
def test_frame_groupby(self):
15081535
grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2994,16 +3021,25 @@ def test_non_cython_api(self):
29943021
assert_frame_equal(result, expected)
29953022

29963023
# describe
2997-
expected = DataFrame(dict(B=concat(
2998-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2999-
keys=[1, 3])))
3000-
expected.index.names = ['A', None]
3024+
expected = pd.concat([(df[df.A == 1].B
3025+
.describe()
3026+
.to_frame()
3027+
.unstack()
3028+
.to_frame()
3029+
.T),
3030+
(df[df.A == 3].B
3031+
.describe()
3032+
.to_frame()
3033+
.unstack()
3034+
.to_frame()
3035+
.T)])
3036+
expected.index = pd.Index([1, 3], name='A')
30013037
result = g.describe()
30023038
assert_frame_equal(result, expected)
30033039

3004-
expected = concat(
3005-
[df.loc[[0, 1], ['A', 'B']].describe(),
3006-
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
3040+
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
3041+
df[df.A == 3].describe().unstack().to_frame().T])
3042+
expected.index = pd.Index([0, 1])
30073043
result = gni.describe()
30083044
assert_frame_equal(result, expected)
30093045

@@ -5149,7 +5185,6 @@ def test_groupby_whitelist(self):
51495185
'tail',
51505186
'cumcount',
51515187
'resample',
5152-
'describe',
51535188
'rank',
51545189
'quantile',
51555190
'fillna',
@@ -5186,7 +5221,6 @@ def test_groupby_whitelist(self):
51865221
'tail',
51875222
'cumcount',
51885223
'resample',
5189-
'describe',
51905224
'rank',
51915225
'quantile',
51925226
'fillna',

0 commit comments

Comments
 (0)