Skip to content

Commit d15088c

Browse files
committed
ENH: refactor Series stat ops to use common nanops, test pandas-dev#384 for Series
1 parent 157087b commit d15088c

File tree

4 files changed

+94
-131
lines changed

4 files changed

+94
-131
lines changed

RELEASE.rst

+6
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ pandas 0.6.0
101101
- Can pass multiple values columns to `pivot_table` (GH #381)
102102
- Can call `DataFrame.delevel` with standard Index with name set (GH #393)
103103
- Use Series name in GroupBy for result index (GH #363)
104+
- Refactor Series/DataFrame stat methods to use common set of NaN-friendly
105+
functions
104106

105107
**Bug fixes**
106108

@@ -142,6 +144,9 @@ pandas 0.6.0
142144
- Don't lose index names in `MultiIndex.droplevel` (GH #394)
143145
- Infer more proper return type in `DataFrame.apply` when no columns or rows
144146
depending on whether the passed function is a reduction (GH #389)
147+
- Always return NA/NaN from Series.min/max and DataFrame.min/max when all of a
148+
row/column/values are NA (GH #384)
149+
145150

146151
Thanks
147152
------
@@ -157,6 +162,7 @@ Thanks
157162
- Wouter Overmeire
158163
- Nathan Pinger
159164
- Christian Prinoth
165+
- Skipper Seabold
160166
- Chang She
161167
- Ted Square
162168
- Aman Thakral

pandas/core/nanops.py

+75-45
Original file line numberDiff line numberDiff line change
@@ -3,48 +3,53 @@
33
from pandas.core.common import isnull, notnull
44
import pandas._tseries as lib
55

6-
def nansum(values, axis=0, skipna=True, copy=True):
6+
try:
7+
import bottleneck as bn
8+
_USE_BOTTLENECK = True
9+
except ImportError:
10+
_USE_BOTTLENECK = False
11+
12+
def nansum(values, axis=None, skipna=True, copy=True):
713
if values.dtype == np.object_:
814
the_sum = values.sum(axis)
915
else:
10-
mask = notnull(values)
16+
mask = isnull(values)
1117

1218
if skipna and not issubclass(values.dtype.type, np.integer):
1319
if copy:
1420
values = values.copy()
15-
np.putmask(values, -mask, 0)
21+
np.putmask(values, mask, 0)
1622

1723
the_sum = values.sum(axis)
18-
the_count = mask.sum(axis)
19-
20-
ct_mask = the_count == 0
21-
if ct_mask.any():
22-
the_sum[ct_mask] = np.nan
24+
the_sum = _maybe_null_out(the_sum, axis, mask)
2325

2426
return the_sum
2527

26-
def nanmean(values, axis=0, skipna=True, copy=True):
28+
def nanmean(values, axis=None, skipna=True, copy=True):
2729
if values.dtype == np.object_:
2830
the_mean = values.sum(axis) / float(values.shape[axis])
2931
else:
30-
mask = notnull(values)
32+
mask = isnull(values)
3133

3234
if skipna and not issubclass(values.dtype.type, np.integer):
3335
if copy:
3436
values = values.copy()
35-
np.putmask(values, -mask, 0)
37+
np.putmask(values, mask, 0)
3638

3739
the_sum = values.sum(axis)
38-
the_count = mask.sum(axis)
39-
the_mean = the_sum / the_count.astype('f8')
40+
count = _get_counts(mask, axis)
4041

41-
ct_mask = the_count == 0
42-
if ct_mask.any():
43-
the_mean[ct_mask] = np.nan
42+
if axis is not None:
43+
the_mean = the_sum / count
44+
ct_mask = count == 0
45+
if ct_mask.any():
46+
the_mean[ct_mask] = np.nan
47+
else:
48+
the_mean = the_sum / count if count > 0 else np.nan
4449

4550
return the_mean
4651

47-
def nanmedian(values, axis=0, skipna=True, copy=True):
52+
def nanmedian(values, axis=None, skipna=True, copy=True):
4853
def get_median(x):
4954
mask = notnull(x)
5055
if not skipna and not mask.all():
@@ -57,11 +62,18 @@ def get_median(x):
5762
if axis == 0:
5863
values = values.T
5964

60-
return np.asarray([get_median(arr) for arr in values])
65+
if values.ndim > 1:
66+
return np.asarray([get_median(arr) for arr in values])
67+
else:
68+
return get_median(values)
6169

62-
def nanvar(values, axis=0, skipna=True, copy=True):
70+
def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
6371
mask = isnull(values)
64-
count = (values.shape[axis] - mask.sum(axis)).astype(float)
72+
73+
if axis is not None:
74+
count = (values.shape[axis] - mask.sum(axis)).astype(float)
75+
else:
76+
count = float(values.size - mask.sum())
6577

6678
if skipna:
6779
if copy:
@@ -70,14 +82,14 @@ def nanvar(values, axis=0, skipna=True, copy=True):
7082

7183
X = values.sum(axis)
7284
XX = (values ** 2).sum(axis)
73-
return (XX - X ** 2 / count) / (count - 1)
85+
return (XX - X ** 2 / count) / (count - ddof)
7486

75-
def nanskew(values, axis=0, skipna=True, copy=True):
87+
def nanskew(values, axis=None, skipna=True, copy=True):
7688
if not isinstance(values.dtype.type, np.floating):
7789
values = values.astype('f8')
7890

7991
mask = isnull(values)
80-
count = (values.shape[axis] - mask.sum(axis)).astype(float)
92+
count = _get_counts(mask, axis)
8193

8294
if skipna:
8395
if copy:
@@ -89,49 +101,67 @@ def nanskew(values, axis=0, skipna=True, copy=True):
89101
C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
90102

91103
# floating point error
92-
B = np.where(np.abs(B) < 1e-14, 0, B)
93-
C = np.where(np.abs(C) < 1e-14, 0, C)
104+
B = _zero_out_fperr(B)
105+
C = _zero_out_fperr(C)
94106

95107
result = ((np.sqrt((count ** 2 - count)) * C) /
96108
((count - 2) * np.sqrt(B) ** 3))
97109

98-
result = np.where(B == 0, 0, result)
99-
100-
return result
110+
if isinstance(result, np.ndarray):
111+
return np.where(B == 0, 0, result)
112+
else:
113+
return 0 if B == 0 else result
101114

102-
def nanmin(values, axis=0, skipna=True, copy=True):
115+
def nanmin(values, axis=None, skipna=True, copy=True):
103116
mask = isnull(values)
104117
if skipna and not issubclass(values.dtype.type, np.integer):
105118
if copy:
106119
values = values.copy()
107120
np.putmask(values, mask, np.inf)
108121
result = values.min(axis)
122+
return _maybe_null_out(result, axis, mask)
109123

110-
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
111-
if null_mask.any():
112-
result = result.astype('f8')
113-
result[null_mask] = np.nan
114-
return result
115-
116-
def nanmax(values, axis=0, skipna=True, copy=True):
124+
def nanmax(values, axis=None, skipna=True, copy=True):
117125
mask = isnull(values)
118126
if skipna and not issubclass(values.dtype.type, np.integer):
119127
if copy:
120128
values = values.copy()
121129
np.putmask(values, mask, -np.inf)
122130
result = values.max(axis)
131+
return _maybe_null_out(result, axis, mask)
123132

124-
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
125-
if null_mask.any():
126-
result = result.astype('f8')
127-
result[null_mask] = np.nan
128-
return result
129-
130-
def nanprod(values, axis=0, skipna=True, copy=True):
133+
def nanprod(values, axis=None, skipna=True, copy=True):
131134
mask = isnull(values)
132135
if skipna and not issubclass(values.dtype.type, np.integer):
136+
if copy:
137+
values = values.copy()
133138
values[mask] = 1
134139
result = values.prod(axis)
135-
count = mask.shape[axis] - mask.sum(axis)
136-
result[count == 0] = np.nan
140+
return _maybe_null_out(result, axis, mask)
141+
142+
def _get_counts(mask, axis):
143+
if axis is not None:
144+
count = (mask.shape[axis] - mask.sum(axis)).astype(float)
145+
else:
146+
count = float(mask.size - mask.sum())
147+
148+
return count
149+
150+
def _maybe_null_out(result, axis, mask):
151+
if axis is not None:
152+
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
153+
if null_mask.any():
154+
result = result.astype('f8')
155+
result[null_mask] = np.nan
156+
else:
157+
null_mask = mask.size - mask.sum()
158+
if null_mask == 0:
159+
result = np.nan
160+
137161
return result
162+
163+
def _zero_out_fperr(arg):
164+
if isinstance(arg, np.ndarray):
165+
return np.where(np.abs(arg) < 1e-14, 0, arg)
166+
else:
167+
return 0 if np.abs(arg) < 1e-14 else arg

pandas/core/series.py

+12-85
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from pandas.util import py3compat
2323
import pandas.core.common as common
2424
import pandas.core.datetools as datetools
25+
import pandas.core.nanops as nanops
2526
import pandas._tseries as lib
2627

2728
__all__ = ['Series', 'TimeSeries']
@@ -640,23 +641,13 @@ def nunique(self):
640641
def sum(self, axis=0, dtype=None, out=None, skipna=True, level=None):
641642
if level is not None:
642643
return self._agg_by_level('sum', level=level, skipna=skipna)
643-
644-
values = self.values.copy()
645-
646-
if skipna:
647-
mask = isnull(values)
648-
if mask.all():
649-
return np.nan
650-
np.putmask(values, mask, 0)
651-
652-
return values.sum()
644+
return nanops.nansum(self.values, skipna=skipna, copy=True)
653645
_add_stat_doc(sum, 'sum', 'sum', extras=_doc_ndarray_interface)
654646

655647
def mean(self, axis=0, dtype=None, out=None, skipna=True, level=None):
656648
if level is not None:
657649
return self._agg_by_level('mean', level=level, skipna=skipna)
658-
659-
return self._ndarray_statistic('mean', dtype=dtype, skipna=skipna)
650+
return nanops.nanmean(self.values, skipna=skipna)
660651
_add_stat_doc(mean, 'mean', 'mean', extras=_doc_ndarray_interface)
661652

662653
def mad(self, skipna=True, level=None):
@@ -670,99 +661,47 @@ def mad(self, skipna=True, level=None):
670661
def median(self, skipna=True, level=None):
671662
if level is not None:
672663
return self._agg_by_level('median', level=level, skipna=skipna)
673-
674-
arr = self.values
675-
if arr.dtype != np.float_:
676-
arr = arr.astype(float)
677-
mask = notnull(arr)
678-
679-
if skipna:
680-
arr = arr[mask]
681-
else:
682-
if not mask.all():
683-
return np.nan
684-
685-
return lib.median(arr)
664+
return nanops.nanmedian(self.values, skipna=skipna)
686665
_add_stat_doc(median, 'median', 'median')
687666

688-
def prod(self, axis=0, dtype=None, out=None, skipna=True, level=None):
667+
def prod(self, axis=None, dtype=None, out=None, skipna=True, level=None):
689668
if level is not None:
690669
return self._agg_by_level('prod', level=level, skipna=skipna)
691-
692-
return self._ndarray_statistic('prod', dtype=dtype, skipna=skipna)
670+
return nanops.nanprod(self.values, skipna=skipna)
693671
_add_stat_doc(prod, 'product', 'product')
694672

695673
def min(self, axis=None, out=None, skipna=True, level=None):
696674
if level is not None:
697675
return self._agg_by_level('min', level=level, skipna=skipna)
698-
699-
arr = self.values.copy()
700-
701-
if skipna:
702-
if not issubclass(arr.dtype.type, np.integer):
703-
np.putmask(arr, isnull(arr), np.inf)
704-
705-
return arr.min()
676+
return nanops.nanmin(self.values, skipna=skipna, copy=True)
706677
_add_stat_doc(min, 'minimum', 'min')
707678

708679
def max(self, axis=None, out=None, skipna=True, level=None):
709680
if level is not None:
710681
return self._agg_by_level('max', level=level, skipna=skipna)
711-
712-
arr = self.values.copy()
713-
714-
if skipna:
715-
if not issubclass(arr.dtype.type, np.integer):
716-
np.putmask(arr, isnull(arr), -np.inf)
717-
718-
return arr.max()
682+
return nanops.nanmax(self.values, skipna=skipna, copy=True)
719683
_add_stat_doc(max, 'maximum', 'max')
720684

721685
def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
722686
level=None):
723687
if level is not None:
724688
return self._agg_by_level('std', level=level, skipna=skipna)
725-
726-
if skipna:
727-
nona = remove_na(self.values)
728-
if len(nona) < 2:
729-
return nan
730-
return ndarray.std(nona, axis, dtype, out, ddof)
731-
732-
return self.values.std(axis, dtype, out, ddof)
689+
return np.sqrt(nanops.nanvar(self.values, skipna=skipna, copy=True,
690+
ddof=ddof))
733691
_add_stat_doc(std, 'unbiased standard deviation', 'stdev')
734692

735693
def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
736694
level=None):
737695
if level is not None:
738696
return self._agg_by_level('var', level=level, skipna=skipna)
739-
740-
if skipna:
741-
nona = remove_na(self.values)
742-
if len(nona) < 2:
743-
return nan
744-
return ndarray.var(nona, axis, dtype, out, ddof)
745-
746-
return self.values.var(axis, dtype, out, ddof)
697+
return nanops.nanvar(self.values, skipna=skipna, copy=True, ddof=ddof)
747698
_add_stat_doc(var, 'unbiased variance', 'var')
748699

749700
def skew(self, skipna=True, level=None):
750701
if level is not None:
751702
return self._agg_by_level('skew', level=level, skipna=skipna)
752703

753-
y = np.array(self.values)
754-
mask = notnull(y)
755-
count = mask.sum()
756-
757-
if count < len(self) and not skipna:
758-
return np.nan
759-
760-
np.putmask(y, -mask, 0)
761-
A = y.sum() / count
762-
B = (y**2).sum() / count - A**2
763-
C = (y**3).sum() / count - A**3 - 3*A*B
764-
765-
return (np.sqrt((count**2-count))*C) / ((count-2)*np.sqrt(B)**3)
704+
return nanops.nanskew(self.values, skipna=skipna, copy=True)
766705
_add_stat_doc(skew, 'unbiased skewness', 'skew')
767706

768707
def idxmin(self, axis=None, out=None, skipna=True):
@@ -803,18 +742,6 @@ def idxmax(self, axis=None, out=None, skipna=True):
803742
np.putmask(arr, isnull(arr), -np.inf)
804743
return self.index[arr.argmax()]
805744

806-
def _ndarray_statistic(self, funcname, dtype=None, skipna=True):
807-
arr = self.values
808-
retVal = getattr(arr, funcname)(dtype=dtype)
809-
810-
if skipna and isnull(retVal):
811-
arr = remove_na(arr)
812-
if len(arr) == 0:
813-
return np.nan
814-
retVal = getattr(arr, funcname)(dtype=dtype)
815-
816-
return retVal
817-
818745
def _agg_by_level(self, name, level=0, skipna=True):
819746
method = getattr(type(self), name)
820747
applyf = lambda x: method(x, skipna=skipna)

pandas/tests/test_series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def _check_stat_op(self, name, alternate):
587587
assert_almost_equal(f(nona), alternate(nona))
588588

589589
allna = self.series * nan
590-
self.assert_(isnull(f(allna)))
590+
self.assert_(np.isnan(f(allna)))
591591

592592
def _check_accum_op(self, name):
593593
func = getattr(np, name)

0 commit comments

Comments
 (0)