Commit 2121b22

add sem to nanops and pandas object apply methods
1 parent ec9a09c commit 2121b22

15 files changed (+178, -30 lines)

doc/source/api.rst (+4)

@@ -350,6 +350,7 @@ Computations / Descriptive Stats
    Series.prod
    Series.quantile
    Series.rank
+   Series.sem
    Series.skew
    Series.std
    Series.sum
@@ -642,6 +643,7 @@ Computations / Descriptive Stats
    DataFrame.prod
    DataFrame.quantile
    DataFrame.rank
+   DataFrame.sem
    DataFrame.skew
    DataFrame.sum
    DataFrame.std
@@ -895,6 +897,7 @@ Computations / Descriptive Stats
    Panel.min
    Panel.pct_change
    Panel.prod
+   Panel.sem
    Panel.skew
    Panel.sum
    Panel.std
@@ -1222,6 +1225,7 @@ Computations / Descriptive Stats

    GroupBy.mean
    GroupBy.median
+   GroupBy.sem
    GroupBy.std
    GroupBy.var
    GroupBy.ohlc

doc/source/basics.rst (+1)

@@ -428,6 +428,7 @@ optional ``level`` parameter which applies only if the object has a
     ``prod``, Product of values
     ``std``, Unbiased standard deviation
     ``var``, Unbiased variance
+    ``sem``, Unbiased standard error of the mean
     ``skew``, Unbiased skewness (3rd moment)
     ``kurt``, Unbiased kurtosis (4th moment)
     ``quantile``, Sample quantile (value at %)
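As context for the new table row: ``sem`` is the unbiased standard deviation divided by the square root of the number of non-missing observations. A minimal sketch of what the reduction computes (the frame and column name are invented for illustration, not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})

    # standard error of the mean: std(ddof=1) over sqrt of the non-NA count
    print(df['x'].sem())
    print(df['x'].std(ddof=1) / np.sqrt(df['x'].count()))  # same value by hand
    print(df.sem())  # column-wise reduction over the whole frame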

doc/source/groupby.rst (+2, -2)

@@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group.
 named *columns*.

 Aggregating functions are ones that reduce the dimension of the returned objects,
-for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is
+for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is
 what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``.

 ``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
@@ -457,7 +457,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching
 Cython-optimized aggregation functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have
+Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have
 optimized Cython implementations:

 .. ipython:: python
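As a short illustration, ``sem`` behaves like the other aggregating functions and collapses each group to a single value (the ``key``/``val`` column names are invented for the example, not from the commit):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'b'],
                       'val': [1.0, 2.0, 3.0, 4.0, 5.0]})

    # one row per group: standard error of the mean of 'val' within each group
    print(df.groupby('key')['val'].sem())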

doc/source/timeseries.rst (+2, -2)

@@ -974,8 +974,8 @@ an array and produces aggregated values:
     ts.resample('5Min', how=np.max)

 Any function available via :ref:`dispatching <groupby.dispatch>` can be given to
-the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``,
-``min``, ``median``, ``first``, ``last``, ``ohlc``.
+the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``,
+``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``.

 For downsampling, ``closed`` can be set to 'left' or 'right' to specify which
 end of the interval is closed:
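A rough sketch of dispatching ``sem`` by name, assuming the 0.14-era ``resample`` signature with its ``how=`` keyword (the date range and series are invented for illustration, not part of the commit):

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2014-01-01', periods=600, freq='S')
    ts = pd.Series(np.random.randn(600), index=rng)

    # standard error of the mean of the observations in each 5-minute bin
    print(ts.resample('5Min', how='sem'))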

doc/source/v0.14.1.txt (+3)

@@ -73,6 +73,9 @@ Enhancements

   See :ref:`the docs <timeseries.timezone>`.

+- Implemented ``sem`` (standard error of the mean) operation for ``Series``,
+  ``DataFrame``, ``Panel``, and ``Groupby`` (:issue:`6897`)
+
 .. _whatsnew_0141.performance:

 Performance

pandas/core/generic.py (+22, -2)

@@ -3794,7 +3794,8 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs):

     @Substitution(outname='variance',
                   desc="Return unbiased variance over requested "
-                       "axis\nNormalized by N-1")
+                       "axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
     @Appender(_num_doc)
     def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         if skipna is None:
@@ -3811,7 +3812,8 @@ def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):

     @Substitution(outname='stdev',
                   desc="Return unbiased standard deviation over requested "
-                       "axis\nNormalized by N-1")
+                       "axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
     @Appender(_num_doc)
     def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         if skipna is None:
@@ -3827,6 +3829,24 @@ def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
         return np.sqrt(result)
     cls.std = std

+    @Substitution(outname='standarderror',
+                  desc="Return unbiased standard error of the mean over "
+                       "requested axis.\n\nNormalized by N-1 by default. "
+                       "This can be changed using the ddof argument")
+    @Appender(_num_doc)
+    def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs):
+        if skipna is None:
+            skipna = True
+        if axis is None:
+            axis = self._stat_axis_number
+        if level is not None:
+            return self._agg_by_level('sem', axis=axis, level=level,
+                                      skipna=skipna, ddof=ddof)
+
+        return self._reduce(nanops.nansem, axis=axis, skipna=skipna,
+                            ddof=ddof)
+    cls.sem = sem
+
     @Substitution(outname='compounded',
                   desc="Return the compound percentage of the values for "
                        "the requested axis")

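The new ``sem`` method mirrors ``var`` and ``std``: it takes ``axis``, ``skipna``, ``level``, and ``ddof`` and reduces through ``nanops.nansem``. A minimal usage sketch with invented data (illustrative only, not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
    df.iloc[0, 0] = np.nan

    print(df.sem())        # per-column SEM; NaNs skipped since skipna defaults to True
    print(df.sem(axis=1))  # row-wise SEM
    print(df.sem(ddof=0))  # population normalization instead of N-1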
pandas/core/groupby.py (+9, -1)

@@ -486,7 +486,7 @@ def __getattr__(self, attr):
                                  (type(self).__name__, attr))

     def __getitem__(self, key):
-        raise NotImplementedError
+        raise NotImplementedError('Not implemented: %s' % key)

     def _make_wrapper(self, name):
         if name not in self._apply_whitelist:
@@ -710,6 +710,14 @@ def var(self, ddof=1):
         f = lambda x: x.var(ddof=ddof)
         return self._python_agg_general(f)

+    def sem(self, ddof=1):
+        """
+        Compute standard error of the mean of groups, excluding missing values
+
+        For multiple groupings, the result index will be a MultiIndex
+        """
+        return self.std(ddof=ddof)/np.sqrt(self.count())
+
     def size(self):
         """
         Compute group sizes
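Note the design choice here: groupby ``sem`` is not a separate kernel but is composed from existing reductions as ``std(ddof)/sqrt(count)``. A small sketch with invented data illustrating the equivalence (not part of the commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                       'val': [1.0, 2.0, 3.0, 5.0]})
    g = df.groupby('key')['val']

    # the groupby method is literally std(ddof)/sqrt(count), so these agree
    print(g.sem(ddof=1))
    print(g.std(ddof=1) / np.sqrt(g.count()))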

pandas/core/nanops.py (+31, -16)

@@ -308,6 +308,24 @@ def get_median(x):
     return _wrap_results(get_median(values), dtype) if notempty else np.nan


+def _get_counts_nanvar(mask, axis, ddof):
+    count = _get_counts(mask, axis)
+
+    d = count-ddof
+
+    # always return NaN, never inf
+    if np.isscalar(count):
+        if count <= ddof:
+            count = np.nan
+            d = np.nan
+    else:
+        mask2 = count <= ddof
+        if mask2.any():
+            np.putmask(d, mask2, np.nan)
+            np.putmask(count, mask2, np.nan)
+    return count, d
+
+
 @disallow('M8')
 @bottleneck_switch(ddof=1)
 def nanvar(values, axis=None, skipna=True, ddof=1):
@@ -316,31 +334,28 @@ def nanvar(values, axis=None, skipna=True, ddof=1):

     mask = isnull(values)

-    if axis is not None:
-        count = (values.shape[axis] - mask.sum(axis)).astype(float)
-    else:
-        count = float(values.size - mask.sum())
+    count, d = _get_counts_nanvar(mask, axis, ddof)

-    d = count-ddof
     if skipna:
         values = values.copy()
         np.putmask(values, mask, 0)

-    # always return NaN, never inf
-    if np.isscalar(count):
-        if count <= ddof:
-            count = np.nan
-            d = np.nan
-    else:
-        mask = count <= ddof
-        if mask.any():
-            np.putmask(d, mask, np.nan)
-            np.putmask(count, mask, np.nan)
-
     X = _ensure_numeric(values.sum(axis))
     XX = _ensure_numeric((values ** 2).sum(axis))
     return np.fabs((XX - X ** 2 / count) / d)


+def nansem(values, axis=None, skipna=True, ddof=1):
+    var = nanvar(values, axis, skipna, ddof=ddof)
+
+    if not isinstance(values.dtype.type, np.floating):
+        values = values.astype('f8')
+    mask = isnull(values)
+    count, _ = _get_counts_nanvar(mask, axis, ddof)
+
+    return np.sqrt(var)/np.sqrt(count)
+
+
 @bottleneck_switch()
 def nanmin(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf')
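In terms of the helper above, ``nansem`` is simply the square root of ``nanvar`` divided by the square root of the per-axis non-NA count. A rough NumPy-level sanity check with an invented array (for this simple case ``np.nanvar``/``np.nanstd`` match what the pandas routines compute):

    import numpy as np

    x = np.array([1.0, 2.0, np.nan, 4.0])

    n = np.sum(~np.isnan(x))          # non-NA count
    var = np.nanvar(x, ddof=1)        # unbiased variance of the non-NA values
    sem = np.sqrt(var) / np.sqrt(n)   # what nansem returns for this input

    print(sem)
    print(np.nanstd(x, ddof=1) / np.sqrt(n))  # equivalent formulation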

pandas/tests/test_frame.py (+17)

@@ -10864,6 +10864,23 @@ def test_var_std(self):
         self.assertFalse((result < 0).any())
         nanops._USE_BOTTLENECK = True

+    def test_sem(self):
+        alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
+        result = self.tsframe.sem(ddof=4)
+        expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x)))
+        assert_almost_equal(result, expected)
+
+        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
+        result = nanops.nansem(arr, axis=0)
+        self.assertFalse((result < 0).any())
+        if nanops._USE_BOTTLENECK:
+            nanops._USE_BOTTLENECK = False
+            result = nanops.nansem(arr, axis=0)
+            self.assertFalse((result < 0).any())
+            nanops._USE_BOTTLENECK = True
+
     def test_skew(self):
         _skip_if_no_scipy()
         from scipy.stats import skew

pandas/tests/test_groupby.py (+15, -5)

@@ -681,11 +681,14 @@ def _check_results(grouped):
         assert_frame_equal(result, expected)

         # group frame by function dict
-        result = grouped.agg(
-            OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean']]))
+        result = grouped.agg(OrderedDict([['A', 'var'],
+                                          ['B', 'std'],
+                                          ['C', 'mean'],
+                                          ['D', 'sem']]))
         expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                           ['B', grouped['B'].std()],
-                                          ['C', grouped['C'].mean()]]))
+                                          ['C', grouped['C'].mean()],
+                                          ['D', grouped['D'].sem()]]))
         assert_frame_equal(result, expected)

         by_weekday = self.tsframe.groupby(lambda x: x.weekday())
@@ -1637,6 +1640,7 @@ def _testit(op):
         _testit(lambda x: x.sum())
         _testit(lambda x: x.std())
         _testit(lambda x: x.var())
+        _testit(lambda x: x.sem())
         _testit(lambda x: x.mean())
         _testit(lambda x: x.median())
         _testit(lambda x: x.prod())
@@ -4170,8 +4174,8 @@ def test_tab_completion(self):
             'agg','aggregate','apply','boxplot','filter','first','get_group',
             'groups','hist','indices','last','max','mean','median',
             'min','name','ngroups','nth','ohlc','plot', 'prod',
-            'size', 'std', 'sum', 'transform', 'var', 'count', 'head', 'describe',
-            'cummax', 'quantile', 'rank', 'cumprod', 'tail',
+            'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head',
+            'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail',
             'resample', 'cummin', 'fillna', 'cumsum', 'cumcount',
             'all', 'shift', 'skew', 'bfill', 'irow', 'ffill',
             'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
@@ -4347,6 +4351,12 @@ def test_ops_general(self):
             ('last', lambda x: x.iloc[-1]),
             ('count', np.size),
         ]
+        try:
+            from scipy.stats import sem
+        except ImportError:
+            pass
+        else:
+            ops.append(('sem', sem))
         df = DataFrame(np.random.randn(1000))
         labels = np.random.randint(0, 50, size=1000).astype(float)

pandas/tests/test_multilevel.py (+1, -1)

@@ -1378,7 +1378,7 @@ def test_count(self):
         self.assertRaises(KeyError, frame.count, level='x')

     AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
-                     'mad', 'std', 'var']
+                     'mad', 'std', 'var', 'sem']

     def test_series_group_min_max(self):
         for op, level, skipna in cart_product(self.AGG_FUNCTIONS,

pandas/tests/test_panel.py (+7)

@@ -127,6 +127,13 @@ def alt(x):
             return np.std(x, ddof=1)
         self._check_stat_op('std', alt)

+    def test_sem(self):
+        def alt(x):
+            if len(x) < 2:
+                return np.nan
+            return np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
     # def test_skew(self):
     #     from scipy.stats import skew

pandas/tests/test_panel4d.py (+7)

@@ -98,6 +98,13 @@ def alt(x):
             return np.std(x, ddof=1)
         self._check_stat_op('std', alt)

+    def test_sem(self):
+        def alt(x):
+            if len(x) < 2:
+                return np.nan
+            return np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
     # def test_skew(self):
     #     from scipy.stats import skew

pandas/tests/test_series.py (+13)

@@ -1980,6 +1980,19 @@ def test_var_std(self):
         result = s.std(ddof=1)
         self.assertTrue(isnull(result))

+    def test_sem(self):
+        alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
+        self._check_stat_op('sem', alt)
+
+        result = self.ts.sem(ddof=4)
+        expected = np.std(self.ts.values, ddof=4)/np.sqrt(len(self.ts.values))
+        assert_almost_equal(result, expected)
+
+        # 1 - element series with ddof=1
+        s = self.ts.iloc[[0]]
+        result = s.sem(ddof=1)
+        self.assert_(isnull(result))
+
     def test_skew(self):
         _skip_if_no_scipy()
