Skip to content

Commit e0da712

Browse files
committed
BUG: stat method refactoring to start addressing pandas-dev#382
1 parent 8d29268 commit e0da712

File tree

3 files changed

+163
-89
lines changed

3 files changed

+163
-89
lines changed

pandas/core/frame.py

+45-83
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from pandas.core.internals import BlockManager, make_block, form_blocks
3232
from pandas.core.series import Series, _is_bool_indexer
3333
from pandas.util import py3compat
34+
import pandas.core.nanops as nanops
3435
import pandas.core.common as common
3536
import pandas.core.datetools as datetools
3637
import pandas._tseries as lib
@@ -2710,57 +2711,36 @@ def _count_level(self, level, axis=0, numeric_only=False):
27102711
else:
27112712
return result
27122713

2713-
def sum(self, axis=0, numeric_only=True, skipna=True, level=None):
2714+
def sum(self, axis=0, numeric_only=None, skipna=True, level=None):
    # Sum along ``axis``.  The user-facing docstring is attached by the
    # ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` the work goes to ``_reduce`` with
    # ``nanops.nansum``; with a ``level`` it goes to ``_agg_by_level``.
    # Note that ``numeric_only`` is not forwarded on the ``level`` path.
    if level is None:
        return self._reduce(nanops.nansum, axis=axis, skipna=skipna,
                            numeric_only=numeric_only)
    return self._agg_by_level('sum', axis=axis, level=level,
                              skipna=skipna)
27392720
_add_stat_doc(sum, 'sum', 'sum', extras=_numeric_only_doc)
27402721

2722+
def mean(self, axis=0, skipna=True, level=None):
    # Mean along ``axis``.  The user-facing docstring is attached by the
    # ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` the work goes to ``_reduce`` with
    # ``nanops.nanmean`` and ``numeric_only=None``; with a ``level`` it
    # goes to ``_agg_by_level``.
    if level is None:
        return self._reduce(nanops.nanmean, axis=axis, skipna=skipna,
                            numeric_only=None)
    return self._agg_by_level('mean', axis=axis, level=level,
                              skipna=skipna)
2728+
_add_stat_doc(mean, 'mean', 'mean')
2729+
27412730
def min(self, axis=0, skipna=True, level=None):
    # Minimum along ``axis``.  The user-facing docstring is attached by
    # the ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` the work goes to ``_reduce`` with
    # ``nanops.nanmin`` and ``numeric_only=None``; with a ``level`` it
    # goes to ``_agg_by_level``.
    if level is None:
        return self._reduce(nanops.nanmin, axis=axis, skipna=skipna,
                            numeric_only=None)
    return self._agg_by_level('min', axis=axis, level=level,
                              skipna=skipna)
27522736
_add_stat_doc(min, 'minimum', 'min')
27532737

27542738
def max(self, axis=0, skipna=True, level=None):
    # Maximum along ``axis``.  The user-facing docstring is attached by
    # the ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` the work goes to ``_reduce`` with
    # ``nanops.nanmax`` and ``numeric_only=None``; with a ``level`` it
    # goes to ``_agg_by_level``.
    if level is None:
        return self._reduce(nanops.nanmax, axis=axis, skipna=skipna,
                            numeric_only=None)
    return self._agg_by_level('max', axis=axis, level=level,
                              skipna=skipna)
27642744
_add_stat_doc(max, 'maximum', 'max')
27652745

27662746
def prod(self, axis=0, skipna=True, level=None):
@@ -2781,16 +2761,6 @@ def prod(self, axis=0, skipna=True, level=None):
27812761
na_action='NA/null values are treated as 1')
27822762
product = prod
27832763

2784-
def mean(self, axis=0, skipna=True, level=None):
2785-
if level is not None:
2786-
return self._agg_by_level('mean', axis=axis, level=level,
2787-
skipna=skipna)
2788-
2789-
summed = self.sum(axis, numeric_only=True, skipna=skipna)
2790-
count = self.count(axis, numeric_only=True).astype(float)
2791-
return summed / count
2792-
_add_stat_doc(mean, 'mean', 'mean')
2793-
27942764
def median(self, axis=0, skipna=True, level=None):
27952765
if level is not None:
27962766
return self._agg_by_level('median', axis=axis, level=level,
@@ -2839,59 +2809,51 @@ def var(self, axis=0, skipna=True, level=None):
28392809
if level is not None:
28402810
return self._agg_by_level('var', axis=axis, level=level,
28412811
skipna=skipna)
2842-
2843-
y, axis_labels = self._get_agg_data(axis, numeric_only=True)
2844-
2845-
mask = np.isnan(y)
2846-
count = (y.shape[axis] - mask.sum(axis)).astype(float)
2847-
2848-
if skipna:
2849-
np.putmask(y, mask, 0)
2850-
2851-
X = y.sum(axis)
2852-
XX = (y ** 2).sum(axis)
2853-
2854-
theVar = (XX - X ** 2 / count) / (count - 1)
2855-
2856-
return Series(theVar, index=axis_labels)
2812+
return self._reduce(nanops.nanvar, axis=axis, skipna=skipna,
2813+
numeric_only=None)
28572814
_add_stat_doc(var, 'unbiased variance', 'var')
28582815

28592816
def std(self, axis=0, skipna=True, level=None):
    # Standard deviation along ``axis``.  The user-facing docstring is
    # attached by the ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` this is the element-wise square root of
    # ``self.var``; with a ``level`` it goes to ``_agg_by_level``.
    if level is None:
        variance = self.var(axis=axis, skipna=skipna)
        return np.sqrt(variance)
    return self._agg_by_level('std', axis=axis, level=level,
                              skipna=skipna)
28652821
_add_stat_doc(std, 'unbiased standard deviation', 'std')
28662822

28672823
def skew(self, axis=0, skipna=True, level=None):
    # Skewness along ``axis``.  The user-facing docstring is attached by
    # the ``_add_stat_doc`` call that follows this method.
    #
    # Without a ``level`` the work goes to ``_reduce`` with
    # ``nanops.nanskew`` and ``numeric_only=None``; with a ``level`` it
    # goes to ``_agg_by_level``.
    if level is None:
        return self._reduce(nanops.nanskew, axis=axis, skipna=skipna,
                            numeric_only=None)
    return self._agg_by_level('skew', axis=axis, level=level,
                              skipna=skipna)
2829+
_add_stat_doc(skew, 'unbiased skewness', 'skew')
28712830

2872-
y, axis_labels = self._get_agg_data(axis, numeric_only=True)
2873-
2874-
mask = np.isnan(y)
2875-
count = (y.shape[axis] - mask.sum(axis)).astype(float)
2876-
2877-
if skipna:
2878-
np.putmask(y, mask, 0)
2879-
2880-
A = y.sum(axis) / count
2881-
B = (y ** 2).sum(axis) / count - A ** 2
2882-
C = (y ** 3).sum(axis) / count - A ** 3 - 3 * A * B
2883-
2884-
# floating point error
2885-
B = np.where(np.abs(B) < 1e-14, 0, B)
2886-
C = np.where(np.abs(C) < 1e-14, 0, C)
2887-
2888-
result = ((np.sqrt((count ** 2 - count)) * C) /
2889-
((count - 2) * np.sqrt(B) ** 3))
2831+
def _reduce(self, op, axis=0, skipna=True, numeric_only=None):
    """
    Apply a reduction ``op`` to the frame's values and wrap the result.

    Parameters
    ----------
    op : callable
        Called as ``op(values, axis=axis, skipna=skipna, copy=True)``.
    axis : int, default 0
        Axis forwarded to ``op`` and used to pick the result labels.
    skipna : bool, default True
        Forwarded to ``op``.
    numeric_only : bool or None, default None
        None  -> try ``op`` on all values; if anything raises, retry on
                 the result of ``_get_numeric_data()``.
        True  -> run ``op`` on ``_get_numeric_data()`` only.
        False -> run ``op`` on all values, letting errors propagate.

    Returns
    -------
    Series
        Reduction result indexed by the aggregation-axis labels of
        whichever data was actually reduced.
    """
    def apply_op(arr):
        return op(arr, axis=axis, skipna=skipna, copy=True)

    labels = self._get_agg_axis(axis)

    if numeric_only is None:
        try:
            values = self.values
            # NOTE(review): the array is copied here for non-mixed frames
            # and ``op`` is also asked to copy (copy=True), which looks
            # like two copies -- confirm whether copy=False would be safe.
            if not self._is_mixed_type:
                values = values.copy()
            result = apply_op(values)
        except Exception:
            # Deliberate best-effort fallback: any failure on the full
            # data is answered by reducing the numeric subset instead.
            numeric = self._get_numeric_data()
            result = apply_op(numeric.values)
            labels = numeric._get_agg_axis(axis)
    elif numeric_only:
        numeric = self._get_numeric_data()
        values = numeric.values
        labels = numeric._get_agg_axis(axis)
        result = apply_op(values)
    else:
        result = apply_op(self.values)

    # Object-dtype results are coerced to float64 before wrapping.
    if result.dtype == np.object_:
        result = result.astype('f8')
    return Series(result, index=labels)
28952857

28962858
def idxmin(self, axis=0, skipna=True):
28972859
"""

pandas/core/nanops.py

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import numpy as np
2+
3+
from pandas.core.common import isnull, notnull
4+
5+
def nansum(values, axis=0, skipna=True, copy=True):
    """
    Sum ``values`` along ``axis``, optionally treating nulls as zero.

    Parameters
    ----------
    values : ndarray
    axis : int, default 0
    skipna : bool, default True
        If True and the dtype is not an integer type, null entries are
        replaced by 0 before summing.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        ``values`` is modified in place.

    Returns
    -------
    ndarray or scalar
        The sums.  Positions with no non-null entry are NaN.  Object
        arrays are summed as-is, with no null handling.
    """
    if values.dtype == np.object_:
        return values.sum(axis)

    mask = notnull(values)

    if skipna and not issubclass(values.dtype.type, np.integer):
        if copy:
            values = values.copy()
        # ``~`` is the boolean inversion; unary minus on a boolean array
        # is rejected by current NumPy.
        np.putmask(values, ~mask, 0)

    the_sum = values.sum(axis)
    the_count = mask.sum(axis)
    ct_mask = the_count == 0

    # A 1-D input (or axis=None) reduces to a scalar, which does not
    # support item assignment.
    if np.ndim(the_sum) == 0:
        if ct_mask:
            return np.nan
        return the_sum

    if ct_mask.any():
        # Integer/boolean results cannot hold NaN; promote them first.
        if not issubclass(the_sum.dtype.type, np.inexact):
            the_sum = the_sum.astype('f8')
        the_sum[ct_mask] = np.nan

    return the_sum
24+
25+
def nanmean(values, axis=0, skipna=True, copy=True):
    """
    Mean of ``values`` along ``axis``, optionally skipping nulls.

    Parameters
    ----------
    values : ndarray
    axis : int, default 0
    skipna : bool, default True
        If True and the dtype is not an integer type, null entries are
        replaced by 0 before summing.  The divisor is always the count
        of non-null entries.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        ``values`` is modified in place.

    Returns
    -------
    ndarray or scalar
        The means.  Positions with no non-null entry are NaN.  Object
        arrays are summed as-is and divided by the full axis length.
    """
    if values.dtype == np.object_:
        return values.sum(axis) / float(values.shape[axis])

    mask = notnull(values)

    if skipna and not issubclass(values.dtype.type, np.integer):
        if copy:
            values = values.copy()
        # ``~`` is the boolean inversion; unary minus on a boolean array
        # is rejected by current NumPy.
        np.putmask(values, ~mask, 0)

    the_sum = values.sum(axis)
    the_count = mask.sum(axis)
    the_mean = the_sum / the_count.astype('f8')
    ct_mask = the_count == 0

    # A 1-D input (or axis=None) reduces to a scalar, which does not
    # support item assignment.
    if np.ndim(the_mean) == 0:
        if ct_mask:
            return np.nan
        return the_mean

    if ct_mask.any():
        the_mean[ct_mask] = np.nan

    return the_mean
45+
46+
def nanvar(values, axis=0, skipna=True, copy=True):
    """
    Unbiased (n - 1 denominator) variance of ``values`` along ``axis``.

    Parameters
    ----------
    values : ndarray
        Non-float input (integer, boolean, object) is converted to
        float64 first, so the squares are not computed in a narrow
        integer dtype where they could overflow.
    axis : int, default 0
    skipna : bool, default True
        If True, null entries are replaced by 0 before summing.  The
        count used in the formula always excludes nulls.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        float input is modified in place.

    Returns
    -------
    ndarray or scalar
        ``(sum(x**2) - sum(x)**2 / n) / (n - 1)`` with ``n`` the number
        of non-null entries.
    """
    needs_copy = copy
    if not issubclass(values.dtype.type, np.inexact):
        # astype already returns a fresh array; no second copy needed.
        values = values.astype('f8')
        needs_copy = False

    mask = isnull(values)
    count = (values.shape[axis] - mask.sum(axis)).astype(float)

    if skipna:
        if needs_copy:
            values = values.copy()
        np.putmask(values, mask, 0)

    X = values.sum(axis)
    XX = (values ** 2).sum(axis)
    return (XX - X ** 2 / count) / (count - 1)
58+
59+
def nanskew(values, axis=0, skipna=True, copy=True):
    """
    Sample skewness of ``values`` along ``axis``.

    Parameters
    ----------
    values : ndarray
        Non-float input is converted to float64 first.
    axis : int, default 0
    skipna : bool, default True
        If True, null entries are replaced by 0 before the moments are
        summed.  The count used in the formula always excludes nulls.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        float input is modified in place.

    Returns
    -------
    ndarray
        ``sqrt(n * (n - 1)) / (n - 2) * m3 / m2 ** 1.5`` where ``m2`` and
        ``m3`` are the second and third central moments.  The result is
        0 wherever ``m2`` is (numerically) zero.
    """
    needs_copy = copy
    # issubclass, not isinstance: dtype.type is a class, so the original
    # isinstance test was always False and every input was cast/copied.
    if not issubclass(values.dtype.type, np.floating):
        # astype already returns a fresh array; no second copy needed.
        values = values.astype('f8')
        needs_copy = False

    mask = isnull(values)
    count = (values.shape[axis] - mask.sum(axis)).astype(float)

    if skipna:
        if needs_copy:
            values = values.copy()
        np.putmask(values, mask, 0)

    A = values.sum(axis) / count
    B = (values ** 2).sum(axis) / count - A ** 2
    C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B

    # floating point error
    B = np.where(np.abs(B) < 1e-14, 0, B)
    C = np.where(np.abs(C) < 1e-14, 0, C)

    result = ((np.sqrt((count ** 2 - count)) * C) /
              ((count - 2) * np.sqrt(B) ** 3))

    # Zero spread means zero skew (and avoids the 0 / 0 above).
    result = np.where(B == 0, 0, result)

    return result
85+
86+
def nanmin(values, axis=0, skipna=True, copy=True):
    """
    Minimum of ``values`` along ``axis``, optionally skipping nulls.

    Parameters
    ----------
    values : ndarray
    axis : int, default 0
    skipna : bool, default True
        If True and the dtype is not an integer type, null entries are
        replaced by ``+inf`` so they cannot win the comparison.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        ``values`` is modified in place.

    Returns
    -------
    ndarray or scalar
        The minima.  When nulls are skipped, positions whose entries are
        all null are NaN rather than the ``+inf`` fill value.
    """
    if not skipna or issubclass(values.dtype.type, np.integer):
        return values.min(axis)

    mask = isnull(values)
    if copy:
        values = values.copy()
    np.putmask(values, mask, np.inf)
    result = values.min(axis)

    # Do not leak the +inf sentinel for slices with nothing valid in them.
    all_missing = mask.all(axis)
    if np.ndim(result) == 0:
        if all_missing:
            return np.nan
        return result
    if all_missing.any():
        result[all_missing] = np.nan
    return result
92+
93+
def nanmax(values, axis=0, skipna=True, copy=True):
    """
    Maximum of ``values`` along ``axis``, optionally skipping nulls.

    Parameters
    ----------
    values : ndarray
    axis : int, default 0
    skipna : bool, default True
        If True and the dtype is not an integer type, null entries are
        replaced by ``-inf`` so they cannot win the comparison.
    copy : bool, default True
        If True, the null replacement is done on a copy.  If False,
        ``values`` is modified in place.

    Returns
    -------
    ndarray or scalar
        The maxima.  When nulls are skipped, positions whose entries are
        all null are NaN rather than the ``-inf`` fill value.
    """
    if not skipna or issubclass(values.dtype.type, np.integer):
        return values.max(axis)

    mask = isnull(values)
    if copy:
        values = values.copy()
    np.putmask(values, mask, -np.inf)
    result = values.max(axis)

    # Do not leak the -inf sentinel for slices with nothing valid in them.
    all_missing = mask.all(axis)
    if np.ndim(result) == 0:
        if all_missing:
            return np.nan
        return result
    if all_missing.any():
        result[all_missing] = np.nan
    return result
100+

pandas/tests/test_frame.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -3002,12 +3002,6 @@ def test_get_numeric_data(self):
30023002
expected = df.ix[:, []]
30033003
assert_frame_equal(result, expected)
30043004

3005-
def test_statistics(self):
3006-
# unnecessary?
3007-
sumFrame = self.frame.apply(np.sum)
3008-
for col, series in self.frame.iteritems():
3009-
self.assertEqual(sumFrame[col], series.sum())
3010-
30113005
def test_count(self):
30123006
f = lambda s: notnull(s).sum()
30133007
self._check_stat_op('count', f, has_skipna=False)
@@ -3024,6 +3018,24 @@ def test_count(self):
30243018
def test_sum(self):
    # Run the shared stat-op check for ``sum`` with np.sum as the
    # alternative implementation.
    alternative = np.sum
    self._check_stat_op('sum', alternative)
30263020

3021+
def test_stat_ops_attempt_obj_array(self):
    # Row-wise statistics on an object-dtype frame holding floats must
    # match the same statistics on its float64 conversion.
    raw = {
        'a': [-0.00049987540199591344, -0.0016467257772919831,
              0.00067695870775883013],
        'b': [-0, -0, 0.0],
        'c': [0.00031111847529610595, 0.0014902627951905339,
              -0.00094099200035979691]
    }
    df = DataFrame(raw, index=['foo', 'bar', 'baz'], dtype='O')

    for name in ('sum', 'mean', 'var', 'std', 'skew', 'min', 'max'):
        # Re-checked on every pass: the frame must still be object dtype
        # when each method is called.
        self.assert_(df.values.dtype == np.object_)
        on_objects = getattr(df, name)
        on_floats = getattr(df.astype('f8'), name)
        result = on_objects(1)
        expected = on_floats(1)
        assert_series_equal(result, expected)
3038+
30273039
def test_mean(self):
    # Run the shared stat-op check for ``mean`` with np.mean as the
    # alternative implementation.
    alternative = np.mean
    self._check_stat_op('mean', alternative)
30293041

0 commit comments

Comments
 (0)