Skip to content

Commit 8fed790

Browse files
committed
Merge pull request #7133 from toddrjen/sem
ENH: Implement "standard error of the mean"
2 parents 4fed1e0 + 2121b22 commit 8fed790

15 files changed

+234
-46
lines changed

doc/source/api.rst

+4
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ Computations / Descriptive Stats
350350
Series.prod
351351
Series.quantile
352352
Series.rank
353+
Series.sem
353354
Series.skew
354355
Series.std
355356
Series.sum
@@ -642,6 +643,7 @@ Computations / Descriptive Stats
642643
DataFrame.prod
643644
DataFrame.quantile
644645
DataFrame.rank
646+
DataFrame.sem
645647
DataFrame.skew
646648
DataFrame.sum
647649
DataFrame.std
@@ -895,6 +897,7 @@ Computations / Descriptive Stats
895897
Panel.min
896898
Panel.pct_change
897899
Panel.prod
900+
Panel.sem
898901
Panel.skew
899902
Panel.sum
900903
Panel.std
@@ -1222,6 +1225,7 @@ Computations / Descriptive Stats
12221225

12231226
GroupBy.mean
12241227
GroupBy.median
1228+
GroupBy.sem
12251229
GroupBy.std
12261230
GroupBy.var
12271231
GroupBy.ohlc

doc/source/basics.rst

+1
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ optional ``level`` parameter which applies only if the object has a
428428
``prod``, Product of values
429429
``std``, Unbiased standard deviation
430430
``var``, Unbiased variance
431+
``sem``, Unbiased standard error of the mean
431432
``skew``, Unbiased skewness (3rd moment)
432433
``kurt``, Unbiased kurtosis (4th moment)
433434
``quantile``, Sample quantile (value at %)

doc/source/groupby.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group.
397397
named *columns*.
398398

399399
Aggregating functions are ones that reduce the dimension of the returned objects,
400-
for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is
400+
for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is
401401
what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``.
402402

403403
``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
@@ -457,7 +457,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching
457457
Cython-optimized aggregation functions
458458
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
459459

460-
Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have
460+
Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have
461461
optimized Cython implementations:
462462

463463
.. ipython:: python

doc/source/timeseries.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -974,8 +974,8 @@ an array and produces aggregated values:
974974
ts.resample('5Min', how=np.max)
975975
976976
Any function available via :ref:`dispatching <groupby.dispatch>` can be given to
977-
the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``,
978-
``min``, ``median``, ``first``, ``last``, ``ohlc``.
977+
the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``,
978+
``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``.
979979

980980
For downsampling, ``closed`` can be set to 'left' or 'right' to specify which
981981
end of the interval is closed:

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ Enhancements
7373

7474
See :ref:`the docs <timeseries.timezone>`.
7575

76+
- Implemented ``sem`` (standard error of the mean) operation for ``Series``,
77+
``DataFrame``, ``Panel``, and ``GroupBy`` (:issue:`6897`)
78+
7679
.. _whatsnew_0141.performance:
7780

7881
Performance

pandas/core/generic.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -3794,7 +3794,8 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs):
37943794

37953795
@Substitution(outname='variance',
37963796
desc="Return unbiased variance over requested "
3797-
"axis\nNormalized by N-1")
3797+
"axis.\n\nNormalized by N-1 by default. "
3798+
"This can be changed using the ddof argument")
37983799
@Appender(_num_doc)
37993800
def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
38003801
if skipna is None:
@@ -3811,7 +3812,8 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
38113812

38123813
@Substitution(outname='stdev',
38133814
desc="Return unbiased standard deviation over requested "
3814-
"axis\nNormalized by N-1")
3815+
"axis.\n\nNormalized by N-1 by default. "
3816+
"This can be changed using the ddof argument")
38153817
@Appender(_num_doc)
38163818
def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
38173819
if skipna is None:
@@ -3827,6 +3829,24 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
38273829
return np.sqrt(result)
38283830
cls.std = std
38293831

3832+
@Substitution(outname='standarderror',
3833+
desc="Return unbiased standard error of the mean over "
3834+
"requested axis.\n\nNormalized by N-1 by default. "
3835+
"This can be changed using the ddof argument")
3836+
@Appender(_num_doc)
3837+
def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
3838+
if skipna is None:
3839+
skipna = True
3840+
if axis is None:
3841+
axis = self._stat_axis_number
3842+
if level is not None:
3843+
return self._agg_by_level('sem', axis=axis, level=level,
3844+
skipna=skipna, ddof=ddof)
3845+
3846+
return self._reduce(nanops.nansem, axis=axis, skipna=skipna,
3847+
ddof=ddof)
3848+
cls.sem = sem
3849+
38303850
@Substitution(outname='compounded',
38313851
desc="Return the compound percentage of the values for "
38323852
"the requested axis")

pandas/core/groupby.py

+12-17
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ def __getattr__(self, attr):
486486
(type(self).__name__, attr))
487487

488488
def __getitem__(self, key):
489-
raise NotImplementedError
489+
raise NotImplementedError('Not implemented: %s' % key)
490490

491491
def _make_wrapper(self, name):
492492
if name not in self._apply_whitelist:
@@ -695,12 +695,7 @@ def std(self, ddof=1):
695695
For multiple groupings, the result index will be a MultiIndex
696696
"""
697697
# todo, implement at cython level?
698-
if ddof == 1:
699-
return self._cython_agg_general('std')
700-
else:
701-
self._set_selection_from_grouper()
702-
f = lambda x: x.std(ddof=ddof)
703-
return self._python_agg_general(f)
698+
return np.sqrt(self.var(ddof=ddof))
704699

705700
def var(self, ddof=1):
706701
"""
@@ -715,6 +710,14 @@ def var(self, ddof=1):
715710
f = lambda x: x.var(ddof=ddof)
716711
return self._python_agg_general(f)
717712

713+
def sem(self, ddof=1):
714+
"""
715+
Compute standard error of the mean of groups, excluding missing values
716+
717+
For multiple groupings, the result index will be a MultiIndex
718+
"""
719+
return self.std(ddof=ddof)/np.sqrt(self.count())
720+
718721
def size(self):
719722
"""
720723
Compute group sizes
@@ -1332,7 +1335,6 @@ def get_group_levels(self):
13321335
'name': 'group_median'
13331336
},
13341337
'var': 'group_var',
1335-
'std': 'group_var',
13361338
'first': {
13371339
'name': 'group_nth',
13381340
'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
@@ -1341,10 +1343,6 @@ def get_group_levels(self):
13411343
'count': 'group_count',
13421344
}
13431345

1344-
_cython_transforms = {
1345-
'std': np.sqrt,
1346-
}
1347-
13481346
_cython_arity = {
13491347
'ohlc': 4, # OHLC
13501348
}
@@ -1455,7 +1453,6 @@ def aggregate(self, values, how, axis=0):
14551453

14561454
def _aggregate(self, result, counts, values, how, is_numeric):
14571455
agg_func, dtype = self._get_aggregate_function(how, values)
1458-
trans_func = self._cython_transforms.get(how, lambda x: x)
14591456

14601457
comp_ids, _, ngroups = self.group_info
14611458
if values.ndim > 3:
@@ -1469,7 +1466,7 @@ def _aggregate(self, result, counts, values, how, is_numeric):
14691466
else:
14701467
agg_func(result, counts, values, comp_ids)
14711468

1472-
return trans_func(result)
1469+
return result
14731470

14741471
def agg_series(self, obj, func):
14751472
try:
@@ -1669,7 +1666,6 @@ def names(self):
16691666
'min': 'group_min_bin',
16701667
'max': 'group_max_bin',
16711668
'var': 'group_var_bin',
1672-
'std': 'group_var_bin',
16731669
'ohlc': 'group_ohlc',
16741670
'first': {
16751671
'name': 'group_nth_bin',
@@ -1688,7 +1684,6 @@ def names(self):
16881684
def _aggregate(self, result, counts, values, how, is_numeric=True):
16891685

16901686
agg_func, dtype = self._get_aggregate_function(how, values)
1691-
trans_func = self._cython_transforms.get(how, lambda x: x)
16921687

16931688
if values.ndim > 3:
16941689
# punting for now
@@ -1699,7 +1694,7 @@ def _aggregate(self, result, counts, values, how, is_numeric=True):
16991694
else:
17001695
agg_func(result, counts, values, self.bins)
17011696

1702-
return trans_func(result)
1697+
return result
17031698

17041699
def agg_series(self, obj, func):
17051700
dummy = obj[:0]

pandas/core/nanops.py

+31-16
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,24 @@ def get_median(x):
308308
return _wrap_results(get_median(values), dtype) if notempty else np.nan
309309

310310

311+
def _get_counts_nanvar(mask, axis, ddof):
312+
count = _get_counts(mask, axis)
313+
314+
d = count-ddof
315+
316+
# always return NaN, never inf
317+
if np.isscalar(count):
318+
if count <= ddof:
319+
count = np.nan
320+
d = np.nan
321+
else:
322+
mask2 = count <= ddof
323+
if mask2.any():
324+
np.putmask(d, mask2, np.nan)
325+
np.putmask(count, mask2, np.nan)
326+
return count, d
327+
328+
311329
@disallow('M8')
312330
@bottleneck_switch(ddof=1)
313331
def nanvar(values, axis=None, skipna=True, ddof=1):
@@ -316,31 +334,28 @@ def nanvar(values, axis=None, skipna=True, ddof=1):
316334

317335
mask = isnull(values)
318336

319-
if axis is not None:
320-
count = (values.shape[axis] - mask.sum(axis)).astype(float)
321-
else:
322-
count = float(values.size - mask.sum())
337+
count, d = _get_counts_nanvar(mask, axis, ddof)
323338

324-
d = count-ddof
325339
if skipna:
326340
values = values.copy()
327341
np.putmask(values, mask, 0)
328342

329-
# always return NaN, never inf
330-
if np.isscalar(count):
331-
if count <= ddof:
332-
count = np.nan
333-
d = np.nan
334-
else:
335-
mask = count <= ddof
336-
if mask.any():
337-
np.putmask(d, mask, np.nan)
338-
np.putmask(count, mask, np.nan)
339-
340343
X = _ensure_numeric(values.sum(axis))
341344
XX = _ensure_numeric((values ** 2).sum(axis))
342345
return np.fabs((XX - X ** 2 / count) / d)
343346

347+
348+
def nansem(values, axis=None, skipna=True, ddof=1):
349+
var = nanvar(values, axis, skipna, ddof=ddof)
350+
351+
if not isinstance(values.dtype.type, np.floating):
352+
values = values.astype('f8')
353+
mask = isnull(values)
354+
count, _ = _get_counts_nanvar(mask, axis, ddof)
355+
356+
return np.sqrt(var)/np.sqrt(count)
357+
358+
344359
@bottleneck_switch()
345360
def nanmin(values, axis=None, skipna=True):
346361
values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf')

pandas/tests/test_frame.py

+17
Original file line numberDiff line numberDiff line change
@@ -10864,6 +10864,23 @@ def test_var_std(self):
1086410864
self.assertFalse((result < 0).any())
1086510865
nanops._USE_BOTTLENECK = True
1086610866

10867+
def test_sem(self):
10868+
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
10869+
self._check_stat_op('sem', alt)
10870+
10871+
result = self.tsframe.sem(ddof=4)
10872+
expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x)))
10873+
assert_almost_equal(result, expected)
10874+
10875+
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
10876+
result = nanops.nansem(arr, axis=0)
10877+
self.assertFalse((result < 0).any())
10878+
if nanops._USE_BOTTLENECK:
10879+
nanops._USE_BOTTLENECK = False
10880+
result = nanops.nansem(arr, axis=0)
10881+
self.assertFalse((result < 0).any())
10882+
nanops._USE_BOTTLENECK = True
10883+
1086710884
def test_skew(self):
1086810885
_skip_if_no_scipy()
1086910886
from scipy.stats import skew

0 commit comments

Comments
 (0)