
ENH: Implement "standard error of the mean" #7133


Merged: 3 commits, Jun 5, 2014
4 changes: 4 additions & 0 deletions doc/source/api.rst
@@ -350,6 +350,7 @@ Computations / Descriptive Stats
Series.prod
Series.quantile
Series.rank
Series.sem
Series.skew
Series.std
Series.sum
@@ -642,6 +643,7 @@ Computations / Descriptive Stats
DataFrame.prod
DataFrame.quantile
DataFrame.rank
DataFrame.sem
DataFrame.skew
DataFrame.sum
DataFrame.std
@@ -895,6 +897,7 @@ Computations / Descriptive Stats
Panel.min
Panel.pct_change
Panel.prod
Panel.sem
Panel.skew
Panel.sum
Panel.std
@@ -1222,6 +1225,7 @@ Computations / Descriptive Stats

GroupBy.mean
GroupBy.median
GroupBy.sem
GroupBy.std
GroupBy.var
GroupBy.ohlc
1 change: 1 addition & 0 deletions doc/source/basics.rst
@@ -428,6 +428,7 @@ optional ``level`` parameter which applies only if the object has a
``prod``, Product of values
``std``, Unbiased standard deviation
``var``, Unbiased variance
``sem``, Unbiased standard error of the mean
``skew``, Unbiased skewness (3rd moment)
``kurt``, Unbiased kurtosis (4th moment)
``quantile``, Sample quantile (value at %)
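For reference, the ``sem`` entry added above is the unbiased standard deviation divided by the square root of the number of observations. A minimal NumPy-only sketch of that identity (illustrative values, not taken from the PR):

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# unbiased standard error of the mean: std(ddof=1) / sqrt(N)
sem = np.std(x, ddof=1) / np.sqrt(len(x))

# identical to sqrt(var(ddof=1) / N)
alt = np.sqrt(np.var(x, ddof=1) / len(x))
```

Both forms give the same value, which is what ``Series.sem()`` returns with its default ``ddof=1``.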
4 changes: 2 additions & 2 deletions doc/source/groupby.rst
@@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group.
named *columns*.

Aggregating functions are ones that reduce the dimension of the returned objects,
for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is
for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is
what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``.

``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
@@ -457,7 +457,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching
Cython-optimized aggregation functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have
Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have
optimized Cython implementations:

.. ipython:: python
4 changes: 2 additions & 2 deletions doc/source/timeseries.rst
@@ -974,8 +974,8 @@ an array and produces aggregated values:
ts.resample('5Min', how=np.max)

Any function available via :ref:`dispatching <groupby.dispatch>` can be given to
the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``,
``min``, ``median``, ``first``, ``last``, ``ohlc``.
the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``,
``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``.
Contributor: can you add a test for this in test_resample? (not sure how much infrastructure is there for this, i.e. how much is actually already tested)

Contributor Author: Done
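Per the exchange above, ``sem`` as a ``how`` reducer computes the per-bin standard error; for evenly sized bins this is just the per-bin ``std(ddof=1)`` over the square root of the bin size. A NumPy-only sketch with hypothetical data (fixed-width bins standing in for the resampling machinery):

```python
import numpy as np

rng = np.random.RandomState(0)
ts = rng.randn(20)            # e.g. 20 one-minute observations
bins = ts.reshape(4, 5)       # four contiguous 5-minute bins

# per-bin standard error of the mean, as how='sem' would reduce each bin
sem_per_bin = np.std(bins, axis=1, ddof=1) / np.sqrt(bins.shape[1])

# equivalent form via the unbiased variance
alt = np.sqrt(np.var(bins, axis=1, ddof=1) / bins.shape[1])
```

One sem value per bin, exactly like any other dispatched reducer such as ``std`` or ``mean``.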
For downsampling, ``closed`` can be set to 'left' or 'right' to specify which
end of the interval is closed:
3 changes: 3 additions & 0 deletions doc/source/v0.14.1.txt
@@ -73,6 +73,9 @@ Enhancements

See :ref:`the docs <timeseries.timezone>`.

- Implemented ``sem`` (standard error of the mean) operation for ``Series``,
``DataFrame``, ``Panel``, and ``GroupBy`` (:issue:`6897`)

.. _whatsnew_0141.performance:

Performance
24 changes: 22 additions & 2 deletions pandas/core/generic.py
@@ -3794,7 +3794,8 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs):

@Substitution(outname='variance',
desc="Return unbiased variance over requested "
"axis\nNormalized by N-1")
"axis.\n\nNormalized by N-1 by default. "
"This can be changed using the ddof argument")
@Appender(_num_doc)
def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
if skipna is None:
@@ -3811,7 +3812,8 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):

@Substitution(outname='stdev',
desc="Return unbiased standard deviation over requested "
"axis\nNormalized by N-1")
"axis.\n\nNormalized by N-1 by default. "
"This can be changed using the ddof argument")
@Appender(_num_doc)
def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
if skipna is None:
@@ -3827,6 +3829,24 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
return np.sqrt(result)
cls.std = std

@Substitution(outname='standarderror',
Contributor Author: I wasn't exactly sure what the rules regarding "outname" are. Can they have spaces? What about underscores? How long should they be? So this may not be the best name.

Contributor: outname is just a 'descriptive' name used to autogenerate the doc strings, so that is fine.

desc="Return unbiased standard error of the mean over "
"requested axis.\n\nNormalized by N-1 by default. "
"This can be changed using the ddof argument")
@Appender(_num_doc)
def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
if skipna is None:
skipna = True
if axis is None:
axis = self._stat_axis_number
if level is not None:
return self._agg_by_level('sem', axis=axis, level=level,
skipna=skipna, ddof=ddof)

return self._reduce(nanops.nansem, axis=axis, skipna=skipna,
ddof=ddof)
cls.sem = sem

@Substitution(outname='compounded',
desc="Return the compound percentage of the values for "
"the requested axis")
29 changes: 12 additions & 17 deletions pandas/core/groupby.py
@@ -486,7 +486,7 @@ def __getattr__(self, attr):
(type(self).__name__, attr))

def __getitem__(self, key):
raise NotImplementedError
raise NotImplementedError('Not implemented: %s' % key)

def _make_wrapper(self, name):
if name not in self._apply_whitelist:
@@ -695,12 +695,7 @@ def std(self, ddof=1):
For multiple groupings, the result index will be a MultiIndex
"""
# todo, implement at cython level?
if ddof == 1:
return self._cython_agg_general('std')
else:
self._set_selection_from_grouper()
f = lambda x: x.std(ddof=ddof)
return self._python_agg_general(f)
return np.sqrt(self.var(ddof=ddof))

def var(self, ddof=1):
"""
@@ -715,6 +710,14 @@ def var(self, ddof=1):
f = lambda x: x.var(ddof=ddof)
return self._python_agg_general(f)

def sem(self, ddof=1):
"""
Compute standard error of the mean of groups, excluding missing values

For multiple groupings, the result index will be a MultiIndex
"""
return self.std(ddof=ddof)/np.sqrt(self.count())

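The ``sem`` method above reuses existing group reductions: per-group ``std(ddof)`` divided by the square root of the per-group count. A standalone NumPy sketch of the same computation, with hypothetical labels and values:

```python
import numpy as np

values = np.array([1.0, 2.0, 3.0, 10.0, 12.0])
labels = np.array(['a', 'a', 'a', 'b', 'b'])

sems = {}
for key in np.unique(labels):
    grp = values[labels == key]
    # mirrors self.std(ddof=ddof) / np.sqrt(self.count()) per group
    sems[key] = np.std(grp, ddof=1) / np.sqrt(len(grp))
```

For multiple groupings the real method returns a MultiIndex result, but the per-group arithmetic is exactly this.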
def size(self):
"""
Compute group sizes
@@ -1332,7 +1335,6 @@ def get_group_levels(self):
'name': 'group_median'
},
'var': 'group_var',
'std': 'group_var',
'first': {
'name': 'group_nth',
'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
@@ -1341,10 +1343,6 @@
'count': 'group_count',
}

_cython_transforms = {
'std': np.sqrt,
}

_cython_arity = {
'ohlc': 4, # OHLC
}
@@ -1455,7 +1453,6 @@ def aggregate(self, values, how, axis=0):

def _aggregate(self, result, counts, values, how, is_numeric):
agg_func, dtype = self._get_aggregate_function(how, values)
trans_func = self._cython_transforms.get(how, lambda x: x)

comp_ids, _, ngroups = self.group_info
if values.ndim > 3:
@@ -1469,7 +1466,7 @@ def _aggregate(self, result, counts, values, how, is_numeric):
else:
agg_func(result, counts, values, comp_ids)

return trans_func(result)
return result

def agg_series(self, obj, func):
try:
@@ -1669,7 +1666,6 @@ def names(self):
'min': 'group_min_bin',
'max': 'group_max_bin',
'var': 'group_var_bin',
'std': 'group_var_bin',
'ohlc': 'group_ohlc',
'first': {
'name': 'group_nth_bin',
@@ -1688,7 +1684,6 @@
def _aggregate(self, result, counts, values, how, is_numeric=True):

agg_func, dtype = self._get_aggregate_function(how, values)
trans_func = self._cython_transforms.get(how, lambda x: x)

if values.ndim > 3:
# punting for now
@@ -1699,7 +1694,7 @@
else:
agg_func(result, counts, values, self.bins)

return trans_func(result)
return result

def agg_series(self, obj, func):
dummy = obj[:0]
47 changes: 31 additions & 16 deletions pandas/core/nanops.py
@@ -308,6 +308,24 @@ def get_median(x):
return _wrap_results(get_median(values), dtype) if notempty else np.nan


def _get_counts_nanvar(mask, axis, ddof):
count = _get_counts(mask, axis)

d = count-ddof

# always return NaN, never inf
if np.isscalar(count):
if count <= ddof:
count = np.nan
d = np.nan
else:
mask2 = count <= ddof
if mask2.any():
np.putmask(d, mask2, np.nan)
np.putmask(count, mask2, np.nan)
return count, d

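The ``_get_counts_nanvar`` helper above exists so degenerate cases (count ≤ ``ddof``, e.g. a single observation with ``ddof=1``) produce NaN rather than inf downstream. A simplified sketch of its array branch (assumes a 2-D mask; the real helper also handles scalar counts):

```python
import numpy as np

def counts_for_var(mask, axis, ddof):
    # mask flags missing values; count the valid observations along axis
    count = (mask.shape[axis] - mask.sum(axis)).astype(float)
    d = count - ddof
    bad = count <= ddof
    # degenerate columns get NaN so later divisions yield NaN, never inf
    count[bad] = np.nan
    d[bad] = np.nan
    return count, d

# column 0 has two valid values, column 1 only one
mask = np.array([[False, True],
                 [False, False]])
count, d = counts_for_var(mask, axis=0, ddof=1)
```

With ``ddof=1``, the single-observation column gets NaN for both the count and the divisor, so its variance comes out NaN.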

@disallow('M8')
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1):
@@ -316,31 +334,28 @@ def nanvar(values, axis=None, skipna=True, ddof=1):

mask = isnull(values)

if axis is not None:
count = (values.shape[axis] - mask.sum(axis)).astype(float)
else:
count = float(values.size - mask.sum())
count, d = _get_counts_nanvar(mask, axis, ddof)

d = count-ddof
if skipna:
values = values.copy()
np.putmask(values, mask, 0)

# always return NaN, never inf
if np.isscalar(count):
if count <= ddof:
count = np.nan
d = np.nan
else:
mask = count <= ddof
if mask.any():
np.putmask(d, mask, np.nan)
np.putmask(count, mask, np.nan)

X = _ensure_numeric(values.sum(axis))
XX = _ensure_numeric((values ** 2).sum(axis))
return np.fabs((XX - X ** 2 / count) / d)

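``nanvar`` above zeroes out the missing entries and uses the one-pass moment form: with X = Σx and XX = Σx² over the zero-filled values, the unbiased variance is (XX − X²/count)/d, where ``fabs`` guards against tiny negative results from floating-point cancellation. Step by step with a small example:

```python
import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0])
mask = np.isnan(values)

count = float(values.size - mask.sum())   # 3 valid observations
d = count - 1                             # ddof=1

filled = np.where(mask, 0.0, values)      # zero out missing entries
X = filled.sum()                          # sum of values
XX = (filled ** 2).sum()                  # sum of squares
var = np.fabs((XX - X ** 2 / count) / d)
```

The zero-filled entries contribute nothing to either moment, so the result matches the variance of the valid values alone.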

def nansem(values, axis=None, skipna=True, ddof=1):
var = nanvar(values, axis, skipna, ddof=ddof)

if not isinstance(values.dtype.type, np.floating):
values = values.astype('f8')
mask = isnull(values)
count, _ = _get_counts_nanvar(mask, axis, ddof)

return np.sqrt(var)/np.sqrt(count)

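``nansem`` composes the two helpers: √(nanvar) over √(valid count). A NumPy-only consistency check with hypothetical data:

```python
import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
valid = values[~np.isnan(values)]

# sem = sqrt(unbiased variance) / sqrt(number of valid observations)
sem = np.sqrt(np.nanvar(values, ddof=1)) / np.sqrt(valid.size)

# same as computing std(ddof=1)/sqrt(n) directly on the valid values
direct = np.std(valid, ddof=1) / np.sqrt(valid.size)
```

Note that the count in the denominator is the count of non-missing values, which is why ``nansem`` recomputes the mask rather than using ``values.size``.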

@bottleneck_switch()
def nanmin(values, axis=None, skipna=True):
values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf')
17 changes: 17 additions & 0 deletions pandas/tests/test_frame.py
@@ -10864,6 +10864,23 @@ def test_var_std(self):
self.assertFalse((result < 0).any())
nanops._USE_BOTTLENECK = True

def test_sem(self):
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
self._check_stat_op('sem', alt)

result = self.tsframe.sem(ddof=4)
expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x)))
assert_almost_equal(result, expected)

arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
result = nanops.nansem(arr, axis=0)
self.assertFalse((result < 0).any())
if nanops._USE_BOTTLENECK:
nanops._USE_BOTTLENECK = False
result = nanops.nansem(arr, axis=0)
self.assertFalse((result < 0).any())
nanops._USE_BOTTLENECK = True

def test_skew(self):
_skip_if_no_scipy()
from scipy.stats import skew