Commit 2121b22

add sem to nanops and pandas object apply methods
1 parent ec9a09c commit 2121b22

15 files changed (+178, -30 lines)

doc/source/api.rst (+4)

@@ -350,6 +350,7 @@ Computations / Descriptive Stats
    Series.prod
    Series.quantile
    Series.rank
+   Series.sem
    Series.skew
    Series.std
    Series.sum
@@ -642,6 +643,7 @@ Computations / Descriptive Stats
    DataFrame.prod
    DataFrame.quantile
    DataFrame.rank
+   DataFrame.sem
    DataFrame.skew
    DataFrame.sum
    DataFrame.std
@@ -895,6 +897,7 @@ Computations / Descriptive Stats
    Panel.min
    Panel.pct_change
    Panel.prod
+   Panel.sem
    Panel.skew
    Panel.sum
    Panel.std
@@ -1222,6 +1225,7 @@ Computations / Descriptive Stats

    GroupBy.mean
    GroupBy.median
+   GroupBy.sem
    GroupBy.std
    GroupBy.var
    GroupBy.ohlc

doc/source/basics.rst (+1)

@@ -428,6 +428,7 @@ optional ``level`` parameter which applies only if the object has a
     ``prod``, Product of values
     ``std``, Unbiased standard deviation
     ``var``, Unbiased variance
+    ``sem``, Unbiased standard error of the mean
     ``skew``, Unbiased skewness (3rd moment)
     ``kurt``, Unbiased kurtosis (4th moment)
     ``quantile``, Sample quantile (value at %)
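As context for the new table row: ``sem`` is the unbiased standard deviation divided by the square root of the number of non-missing observations. A minimal sketch of what the reduction computes (the frame and column name are invented for illustration, not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})

    # standard error of the mean: std(ddof=1) over sqrt of the non-NA count
    print(df['x'].sem())
    print(df['x'].std(ddof=1) / np.sqrt(df['x'].count()))  # same value by hand
    print(df.sem())  # column-wise reduction over the whole frame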

doc/source/groupby.rst (+2, -2)

@@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group.
 named *columns*.

 Aggregating functions are ones that reduce the dimension of the returned objects,
-for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is
+for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is
 what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``.

 ``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
@@ -457,7 +457,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching
 Cython-optimized aggregation functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have
+Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have
 optimized Cython implementations:

 .. ipython:: python
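As a short illustration, ``sem`` behaves like the other aggregating functions and collapses each group to a single value (the ``key``/``val`` column names are invented for the example, not from the commit):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'b'],
                       'val': [1.0, 2.0, 3.0, 4.0, 5.0]})

    # one row per group: standard error of the mean of 'val' within each group
    print(df.groupby('key')['val'].sem())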

doc/source/timeseries.rst (+2, -2)

@@ -974,8 +974,8 @@ an array and produces aggregated values:
     ts.resample('5Min', how=np.max)

 Any function available via :ref:`dispatching <groupby.dispatch>` can be given to
-the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``,
-``min``, ``median``, ``first``, ``last``, ``ohlc``.
+the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``,
+``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``.

 For downsampling, ``closed`` can be set to 'left' or 'right' to specify which
 end of the interval is closed:
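A rough sketch of dispatching ``sem`` by name, assuming the 0.14-era ``resample`` signature with its ``how=`` keyword (the date range and series are invented for illustration, not part of the commit):

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2014-01-01', periods=600, freq='S')
    ts = pd.Series(np.random.randn(600), index=rng)

    # standard error of the mean of the observations in each 5-minute bin
    print(ts.resample('5Min', how='sem'))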

doc/source/v0.14.1.txt (+3)

@@ -73,6 +73,9 @@ Enhancements

   See :ref:`the docs <timeseries.timezone>`.

+- Implemented ``sem`` (standard error of the mean) operation for ``Series``,
+  ``DataFrame``, ``Panel``, and ``Groupby`` (:issue:`6897`)
+
 .. _whatsnew_0141.performance:

 Performance

pandas/core/generic.py (+22, -2)

@@ -3794,7 +3794,8 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs):

     @Substitution(outname='variance',
                   desc="Return unbiased variance over requested "
-                       "axis\nNormalized by N-1")
+                       "axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
     @Appender(_num_doc)
     def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         if skipna is None:
@@ -3811,7 +3812,8 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):

     @Substitution(outname='stdev',
                   desc="Return unbiased standard deviation over requested "
-                       "axis\nNormalized by N-1")
+                       "axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
     @Appender(_num_doc)
     def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         if skipna is None:
@@ -3827,6 +3829,24 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         return np.sqrt(result)
     cls.std = std

+    @Substitution(outname='standarderror',
+                  desc="Return unbiased standard error of the mean over "
+                       "requested axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
+    @Appender(_num_doc)
+    def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
+        if skipna is None:
+            skipna = True
+        if axis is None:
+            axis = self._stat_axis_number
+        if level is not None:
+            return self._agg_by_level('sem', axis=axis, level=level,
+                                      skipna=skipna, ddof=ddof)
+
+        return self._reduce(nanops.nansem, axis=axis, skipna=skipna,
+                            ddof=ddof)
+    cls.sem = sem
+
     @Substitution(outname='compounded',
                   desc="Return the compound percentage of the values for "
                        "the requested axis")

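The new ``sem`` method mirrors ``var`` and ``std``: it takes ``axis``, ``skipna``, ``level``, and ``ddof`` and reduces through ``nanops.nansem``. A minimal usage sketch with invented data (illustrative only, not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
    df.iloc[0, 0] = np.nan

    print(df.sem())        # per-column SEM; NaNs skipped since skipna defaults to True
    print(df.sem(axis=1))  # row-wise SEM
    print(df.sem(ddof=0))  # population normalization instead of N-1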
pandas/core/groupby.py (+9, -1)

@@ -486,7 +486,7 @@ def __getattr__(self, attr):
                                  (type(self).__name__, attr))

     def __getitem__(self, key):
-        raise NotImplementedError
+        raise NotImplementedError('Not implemented: %s' % key)

     def _make_wrapper(self, name):
         if name not in self._apply_whitelist:
@@ -710,6 +710,14 @@ def var(self, ddof=1):
         f = lambda x: x.var(ddof=ddof)
         return self._python_agg_general(f)

+    def sem(self, ddof=1):
+        """
+        Compute standard error of the mean of groups, excluding missing values
+
+        For multiple groupings, the result index will be a MultiIndex
+        """
+        return self.std(ddof=ddof)/np.sqrt(self.count())
+
     def size(self):
         """
         Compute group sizes
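Note the design choice here: groupby ``sem`` is not a separate kernel but is composed from existing reductions as ``std(ddof)/sqrt(count)``. A small sketch with invented data illustrating the equivalence (not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                       'val': [1.0, 2.0, 3.0, 5.0]})
    g = df.groupby('key')['val']

    # the groupby method is literally std(ddof)/sqrt(count), so these agree
    print(g.sem(ddof=1))
    print(g.std(ddof=1) / np.sqrt(g.count()))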

pandas/core/nanops.py (+31, -16)

@@ -308,6 +308,24 @@ def get_median(x):
     return _wrap_results(get_median(values), dtype) if notempty else np.nan


+def _get_counts_nanvar(mask, axis, ddof):
+    count = _get_counts(mask, axis)
+
+    d = count-ddof
+
+    # always return NaN, never inf
+    if np.isscalar(count):
+        if count <= ddof:
+            count = np.nan
+            d = np.nan
+    else:
+        mask2 = count <= ddof
+        if mask2.any():
+            np.putmask(d, mask2, np.nan)
+            np.putmask(count, mask2, np.nan)
+    return count, d
+
+
 @disallow('M8')
 @bottleneck_switch(ddof=1)
 def nanvar(values, axis=None, skipna=True, ddof=1):
@@ -316,31 +334,28 @@ def nanvar(values, axis=None, skipna=True, ddof=1):

     mask = isnull(values)

-    if axis is not None:
-        count = (values.shape[axis] - mask.sum(axis)).astype(float)
-    else:
-        count = float(values.size - mask.sum())
+    count, d = _get_counts_nanvar(mask, axis, ddof)

-    d = count-ddof
     if skipna:
         values = values.copy()
         np.putmask(values, mask, 0)

-    # always return NaN, never inf
-    if np.isscalar(count):
-        if count <= ddof:
-            count = np.nan
-            d = np.nan
-    else:
-        mask = count <= ddof
-        if mask.any():
-            np.putmask(d, mask, np.nan)
-            np.putmask(count, mask, np.nan)
-
     X = _ensure_numeric(values.sum(axis))
     XX = _ensure_numeric((values ** 2).sum(axis))
     return np.fabs((XX - X ** 2 / count) / d)


+def nansem(values, axis=None, skipna=True, ddof=1):
+    var = nanvar(values, axis, skipna, ddof=ddof)
+
+    if not isinstance(values.dtype.type, np.floating):
+        values = values.astype('f8')
+    mask = isnull(values)
+    count, _ = _get_counts_nanvar(mask, axis, ddof)
+
+    return np.sqrt(var)/np.sqrt(count)
+
+
 @bottleneck_switch()
 def nanmin(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf')
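In terms of the helper above, ``nansem`` is simply the square root of ``nanvar`` divided by the square root of the per-axis non-NA count. A rough NumPy-level sanity check with an invented array (for this simple case ``np.nanvar``/``np.nanstd`` match what the pandas routines compute):

    import numpy as np

    x = np.array([1.0, 2.0, np.nan, 4.0])

    n = np.sum(~np.isnan(x))          # non-NA count
    var = np.nanvar(x, ddof=1)        # unbiased variance of the non-NA values
    sem = np.sqrt(var) / np.sqrt(n)   # what nansem returns for this input

    print(sem)
    print(np.nanstd(x, ddof=1) / np.sqrt(n))  # equivalent formulation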

pandas/tests/test_frame.py (+17)

@@ -10864,6 +10864,23 @@ def test_var_std(self):
         self.assertFalse((result < 0).any())
         nanops._USE_BOTTLENECK = True

+    def test_sem(self):
+        alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
+        result = self.tsframe.sem(ddof=4)
+        expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x)))
+        assert_almost_equal(result, expected)
+
+        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
+        result = nanops.nansem(arr, axis=0)
+        self.assertFalse((result < 0).any())
+        if nanops._USE_BOTTLENECK:
+            nanops._USE_BOTTLENECK = False
+            result = nanops.nansem(arr, axis=0)
+            self.assertFalse((result < 0).any())
+            nanops._USE_BOTTLENECK = True
+
     def test_skew(self):
         _skip_if_no_scipy()
         from scipy.stats import skew

pandas/tests/test_groupby.py (+15, -5)

@@ -681,11 +681,14 @@ def _check_results(grouped):
         assert_frame_equal(result, expected)

         # group frame by function dict
-        result = grouped.agg(
-            OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean']]))
+        result = grouped.agg(OrderedDict([['A', 'var'],
+                                          ['B', 'std'],
+                                          ['C', 'mean'],
+                                          ['D', 'sem']]))
         expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                           ['B', grouped['B'].std()],
-                                          ['C', grouped['C'].mean()]]))
+                                          ['C', grouped['C'].mean()],
+                                          ['D', grouped['D'].sem()]]))
         assert_frame_equal(result, expected)

         by_weekday = self.tsframe.groupby(lambda x: x.weekday())
@@ -1637,6 +1640,7 @@ def _testit(op):
         _testit(lambda x: x.sum())
         _testit(lambda x: x.std())
         _testit(lambda x: x.var())
+        _testit(lambda x: x.sem())
         _testit(lambda x: x.mean())
         _testit(lambda x: x.median())
         _testit(lambda x: x.prod())
@@ -4170,8 +4174,8 @@ def test_tab_completion(self):
             'agg','aggregate','apply','boxplot','filter','first','get_group',
             'groups','hist','indices','last','max','mean','median',
             'min','name','ngroups','nth','ohlc','plot', 'prod',
-            'size', 'std', 'sum', 'transform', 'var', 'count', 'head', 'describe',
-            'cummax', 'quantile', 'rank', 'cumprod', 'tail',
+            'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head',
+            'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail',
             'resample', 'cummin', 'fillna', 'cumsum', 'cumcount',
             'all', 'shift', 'skew', 'bfill', 'irow', 'ffill',
             'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
@@ -4347,6 +4351,12 @@ def test_ops_general(self):
             ('last', lambda x: x.iloc[-1]),
             ('count', np.size),
         ]
+        try:
+            from scipy.stats import sem
+        except ImportError:
+            pass
+        else:
+            ops.append(('sem', sem))
         df = DataFrame(np.random.randn(1000))
         labels = np.random.randint(0, 50, size=1000).astype(float)

pandas/tests/test_multilevel.py (+1, -1)

@@ -1378,7 +1378,7 @@ def test_count(self):
         self.assertRaises(KeyError, frame.count, level='x')

     AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
-                     'mad', 'std', 'var']
+                     'mad', 'std', 'var', 'sem']

     def test_series_group_min_max(self):
         for op, level, skipna in cart_product(self.AGG_FUNCTIONS,

pandas/tests/test_panel.py (+7)

@@ -127,6 +127,13 @@ def alt(x):
             return np.std(x, ddof=1)
         self._check_stat_op('std', alt)

+    def test_sem(self):
+        def alt(x):
+            if len(x) < 2:
+                return np.nan
+            return np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
     # def test_skew(self):
     #     from scipy.stats import skew

pandas/tests/test_panel4d.py (+7)

@@ -98,6 +98,13 @@ def alt(x):
             return np.std(x, ddof=1)
         self._check_stat_op('std', alt)

+    def test_sem(self):
+        def alt(x):
+            if len(x) < 2:
+                return np.nan
+            return np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
     # def test_skew(self):
     #     from scipy.stats import skew

pandas/tests/test_series.py (+13)

@@ -1980,6 +1980,19 @@ def test_var_std(self):
         result = s.std(ddof=1)
         self.assertTrue(isnull(result))

+    def test_sem(self):
+        alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
+        result = self.ts.sem(ddof=4)
+        expected = np.std(self.ts.values, ddof=4)/np.sqrt(len(self.ts.values))
+        assert_almost_equal(result, expected)
+
+        # 1 - element series with ddof=1
+        s = self.ts.iloc[[0]]
+        result = s.sem(ddof=1)
+        self.assert_(isnull(result))
+
     def test_skew(self):
         _skip_if_no_scipy()
