Skip to content

Commit fbb1102

Browse files
committed
ENH: use bottleneck for implemented nanops if installed, GH #91
1 parent f9f198e commit fbb1102

File tree

8 files changed

+140
-68
lines changed

8 files changed

+140
-68
lines changed

RELEASE.rst

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ pandas 0.7.0
6060
and multiple Series to ``Series.append`` too
6161
- Added ``justify`` argument to ``DataFrame.to_string`` to allow different
6262
alignment of column headers
63+
- Add ``sort`` option to GroupBy to allow disabling sorting of the group keys
64+
for potential speedups (GH #595)
65+
- Can pass MaskedArray to Series constructor (PR #563)
66+
- Add Panel item access via attributes and IPython completion (GH #554)
6367

6468
**API Changes**
6569

TODO.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
meDONE
1+
DOCS 0.7.0
2+
----------
3+
- no sort in groupby
4+
- concat with dict
5+
6+
DONE
27
----
38
- SparseSeries name integration + tests
49
- Refactor Series.repr

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3210,7 +3210,7 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True):
32103210
return grouped.aggregate(applyf)
32113211

32123212
def _reduce(self, op, axis=0, skipna=True, numeric_only=None):
3213-
f = lambda x: op(x, axis=axis, skipna=skipna, copy=True)
3213+
f = lambda x: op(x, axis=axis, skipna=skipna)
32143214
labels = self._get_agg_axis(axis)
32153215
if numeric_only is None:
32163216
try:

pandas/core/nanops.py

+87-58
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,52 @@
1111
except ImportError: # pragma: no cover
1212
_USE_BOTTLENECK = False
1313

14-
def nansum(values, axis=None, skipna=True, copy=True):
14+
def _bottleneck_switch(bn_name, alt, **kwargs):
15+
bn_func = getattr(bn, bn_name)
16+
def f(values, axis=None, skipna=True):
17+
try:
18+
if _USE_BOTTLENECK and skipna:
19+
result = bn_func(values, axis=axis, **kwargs)
20+
# prefer to treat inf/-inf as NA
21+
if _has_infs(result):
22+
result = alt(values, axis=axis, skipna=skipna, **kwargs)
23+
else:
24+
result = alt(values, axis=axis, skipna=skipna, **kwargs)
25+
except Exception:
26+
result = alt(values, axis=axis, skipna=skipna, **kwargs)
27+
28+
return result
29+
30+
return f
31+
32+
def _has_infs(result):
33+
if isinstance(result, np.ndarray):
34+
if result.dtype == 'f8':
35+
return lib.has_infs_f8(result)
36+
elif result.dtype == 'f4':
37+
return lib.has_infs_f4(result)
38+
else: # pragma: no cover
39+
raise TypeError('Only suppose float32/64 here')
40+
else:
41+
return np.isinf(result) or np.isneginf(result)
42+
43+
def _nansum(values, axis=None, skipna=True):
1544
mask = isnull(values)
1645

1746
if skipna and not issubclass(values.dtype.type, np.integer):
18-
if copy:
19-
values = values.copy()
47+
values = values.copy()
2048
np.putmask(values, mask, 0)
2149

2250
the_sum = values.sum(axis)
2351
the_sum = _maybe_null_out(the_sum, axis, mask)
2452

2553
return the_sum
2654

27-
def nanmean(values, axis=None, skipna=True, copy=True):
55+
def _nanmean(values, axis=None, skipna=True):
2856
mask = isnull(values)
2957

3058
if skipna and not issubclass(values.dtype.type, np.integer):
31-
if copy:
32-
values = values.copy()
59+
values = values.copy()
3360
np.putmask(values, mask, 0)
3461

3562
the_sum = values.sum(axis)
@@ -44,7 +71,7 @@ def nanmean(values, axis=None, skipna=True, copy=True):
4471
the_mean = the_sum / count if count > 0 else np.nan
4572
return the_mean
4673

47-
def nanmedian(values, axis=None, skipna=True, copy=True):
74+
def _nanmedian(values, axis=None, skipna=True):
4875
def get_median(x):
4976
mask = notnull(x)
5077
if not skipna and not mask.all():
@@ -59,7 +86,7 @@ def get_median(x):
5986
else:
6087
return get_median(values)
6188

62-
def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
89+
def _nanvar(values, axis=None, skipna=True, ddof=1):
6390
mask = isnull(values)
6491

6592
if axis is not None:
@@ -68,52 +95,17 @@ def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
6895
count = float(values.size - mask.sum())
6996

7097
if skipna:
71-
if copy:
72-
values = values.copy()
98+
values = values.copy()
7399
np.putmask(values, mask, 0)
74100

75101
X = values.sum(axis)
76102
XX = (values ** 2).sum(axis)
77103
return (XX - X ** 2 / count) / (count - ddof)
78104

79-
def nanskew(values, axis=None, skipna=True, copy=True):
80-
if not isinstance(values.dtype.type, np.floating):
81-
values = values.astype('f8')
82-
83-
mask = isnull(values)
84-
count = _get_counts(mask, axis)
85-
86-
if skipna:
87-
if copy:
88-
values = values.copy()
89-
np.putmask(values, mask, 0)
90-
91-
A = values.sum(axis) / count
92-
B = (values ** 2).sum(axis) / count - A ** 2
93-
C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
94-
95-
# floating point error
96-
B = _zero_out_fperr(B)
97-
C = _zero_out_fperr(C)
98-
99-
result = ((np.sqrt((count ** 2 - count)) * C) /
100-
((count - 2) * np.sqrt(B) ** 3))
101-
102-
if isinstance(result, np.ndarray):
103-
result = np.where(B == 0, 0, result)
104-
result[count < 3] = np.nan
105-
return result
106-
else:
107-
result = 0 if B == 0 else result
108-
if count < 3:
109-
return np.nan
110-
return result
111-
112-
def nanmin(values, axis=None, skipna=True, copy=True):
105+
def _nanmin(values, axis=None, skipna=True):
113106
mask = isnull(values)
114107
if skipna and not issubclass(values.dtype.type, np.integer):
115-
if copy:
116-
values = values.copy()
108+
values = values.copy()
117109
np.putmask(values, mask, np.inf)
118110
# numpy 1.6.1 workaround in Python 3.x
119111
if (values.dtype == np.object_
@@ -129,11 +121,10 @@ def nanmin(values, axis=None, skipna=True, copy=True):
129121

130122
return _maybe_null_out(result, axis, mask)
131123

132-
def nanmax(values, axis=None, skipna=True, copy=True):
124+
def _nanmax(values, axis=None, skipna=True):
133125
mask = isnull(values)
134126
if skipna and not issubclass(values.dtype.type, np.integer):
135-
if copy:
136-
values = values.copy()
127+
values = values.copy()
137128
np.putmask(values, mask, -np.inf)
138129
# numpy 1.6.1 workaround in Python 3.x
139130
if (values.dtype == np.object_
@@ -149,15 +140,6 @@ def nanmax(values, axis=None, skipna=True, copy=True):
149140
result = values.max(axis)
150141
return _maybe_null_out(result, axis, mask)
151142

152-
def nanprod(values, axis=None, skipna=True, copy=True):
153-
mask = isnull(values)
154-
if skipna and not issubclass(values.dtype.type, np.integer):
155-
if copy:
156-
values = values.copy()
157-
values[mask] = 1
158-
result = values.prod(axis)
159-
return _maybe_null_out(result, axis, mask)
160-
161143
def nanargmax(values, axis=None, skipna=True):
162144
"""
163145
Returns -1 in the NA case
@@ -182,6 +164,53 @@ def nanargmin(values, axis=None, skipna=True):
182164
result = _maybe_arg_null_out(result, axis, mask, skipna)
183165
return result
184166

167+
# Public reductions. Each wrapper calls the same-named bottleneck function
# when _USE_BOTTLENECK and skipna are both true, and otherwise (or when that
# call fails or yields inf) the private NumPy implementation given as the
# second argument.
nansum = _bottleneck_switch('nansum', _nansum)
nanmean = _bottleneck_switch('nanmean', _nanmean)
nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
# ddof=1 is forwarded to whichever implementation ends up running
nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1)
nanmin = _bottleneck_switch('nanmin', _nanmin)
nanmax = _bottleneck_switch('nanmax', _nanmax)
173+
174+
def nanskew(values, axis=None, skipna=True):
    """
    Sample skewness of ``values`` along ``axis``

    Parameters
    ----------
    values : ndarray
        Non-float input is cast to float64
    axis : int, optional
        Axis to reduce over; None reduces over all elements
    skipna : bool, default True
        If True, NA entries are replaced by 0 before summing and the
        moments are divided by ``_get_counts(mask, axis)`` (presumably the
        number of non-NA entries -- confirm against that helper)

    Returns
    -------
    skew : float or ndarray
        NaN where the count is below 3, 0 where the variance term is 0
    """
    # values.dtype.type is a class, so this must be issubclass; isinstance
    # was always False and forced a float64 cast (and copy) of every input
    if not issubclass(values.dtype.type, np.floating):
        values = values.astype('f8')

    mask = isnull(values)
    count = _get_counts(mask, axis)

    if skipna:
        # copy so the caller's array is not modified by the NA fill
        values = values.copy()
        np.putmask(values, mask, 0)

    # A: mean, B: second central moment, C: third central moment
    A = values.sum(axis) / count
    B = (values ** 2).sum(axis) / count - A ** 2
    C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B

    # floating point error
    B = _zero_out_fperr(B)
    C = _zero_out_fperr(C)

    result = ((np.sqrt((count ** 2 - count)) * C) /
              ((count - 2) * np.sqrt(B) ** 3))

    if isinstance(result, np.ndarray):
        # zero variance -> skew of 0 rather than the 0/0 computed above
        result = np.where(B == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if B == 0 else result
        if count < 3:
            return np.nan
        return result
205+
206+
def nanprod(values, axis=None, skipna=True):
    """
    Product of ``values`` along ``axis``, optionally ignoring NA entries

    With ``skipna`` set and a non-integer dtype, NA positions are filled
    with 1 on a copy of the input before multiplying. The raw product is
    then passed through ``_maybe_null_out`` together with the NA mask.
    """
    na_mask = isnull(values)
    fill_na = skipna and not issubclass(values.dtype.type, np.integer)
    if fill_na:
        # copy first so the caller's array is left untouched
        values = values.copy()
        values[na_mask] = 1
    product = values.prod(axis)
    return _maybe_null_out(product, axis, na_mask)
213+
185214
def _maybe_arg_null_out(result, axis, mask, skipna):
186215
# helper function for nanargmin/nanargmax
187216
if axis is None:

pandas/core/panel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ def apply(self, func, axis='major'):
934934
def _reduce(self, op, axis=0, skipna=True):
935935
axis_name = self._get_axis_name(axis)
936936
axis_number = self._get_axis_number(axis_name)
937-
f = lambda x: op(x, axis=axis_number, skipna=skipna, copy=True)
937+
f = lambda x: op(x, axis=axis_number, skipna=skipna)
938938

939939
result = f(self.values)
940940

pandas/core/series.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -737,7 +737,7 @@ def nunique(self):
737737
def sum(self, axis=0, dtype=None, out=None, skipna=True, level=None):
738738
if level is not None:
739739
return self._agg_by_level('sum', level=level, skipna=skipna)
740-
return nanops.nansum(self.values, skipna=skipna, copy=True)
740+
return nanops.nansum(self.values, skipna=skipna)
741741

742742
@Substitution(name='mean', shortname='mean', na_action=_doc_exclude_na,
743743
extras=_doc_ndarray_interface)
@@ -779,15 +779,15 @@ def prod(self, axis=None, dtype=None, out=None, skipna=True, level=None):
779779
def min(self, axis=None, out=None, skipna=True, level=None):
780780
if level is not None:
781781
return self._agg_by_level('min', level=level, skipna=skipna)
782-
return nanops.nanmin(self.values, skipna=skipna, copy=True)
782+
return nanops.nanmin(self.values, skipna=skipna)
783783

784784
@Substitution(name='maximum', shortname='max',
785785
na_action=_doc_exclude_na, extras='')
786786
@Appender(_stat_doc)
787787
def max(self, axis=None, out=None, skipna=True, level=None):
788788
if level is not None:
789789
return self._agg_by_level('max', level=level, skipna=skipna)
790-
return nanops.nanmax(self.values, skipna=skipna, copy=True)
790+
return nanops.nanmax(self.values, skipna=skipna)
791791

792792
@Substitution(name='unbiased standard deviation', shortname='stdev',
793793
na_action=_doc_exclude_na, extras='')
@@ -796,8 +796,7 @@ def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
796796
level=None):
797797
if level is not None:
798798
return self._agg_by_level('std', level=level, skipna=skipna)
799-
return np.sqrt(nanops.nanvar(self.values, skipna=skipna, copy=True,
800-
ddof=ddof))
799+
return np.sqrt(nanops.nanvar(self.values, skipna=skipna))
801800

802801
@Substitution(name='unbiased variance', shortname='var',
803802
na_action=_doc_exclude_na, extras='')
@@ -806,7 +805,7 @@ def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
806805
level=None):
807806
if level is not None:
808807
return self._agg_by_level('var', level=level, skipna=skipna)
809-
return nanops.nanvar(self.values, skipna=skipna, copy=True, ddof=ddof)
808+
return nanops.nanvar(self.values, skipna=skipna)
810809

811810
@Substitution(name='unbiased skewness', shortname='skew',
812811
na_action=_doc_exclude_na, extras='')
@@ -815,7 +814,7 @@ def skew(self, skipna=True, level=None):
815814
if level is not None:
816815
return self._agg_by_level('skew', level=level, skipna=skipna)
817816

818-
return nanops.nanskew(self.values, skipna=skipna, copy=True)
817+
return nanops.nanskew(self.values, skipna=skipna)
819818

820819
def _agg_by_level(self, name, level=0, skipna=True):
821820
grouped = self.groupby(level=level)

pandas/src/tseries.pyx

+28
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,34 @@ def fast_zip(list ndarrays):
424424

425425
return result
426426

427+
def has_infs_f4(ndarray[float32_t] arr):
    """
    Return True if the float32 array contains +inf or -inf, else False

    Scans element by element and stops at the first infinite value.
    """
    cdef:
        Py_ssize_t idx, length = len(arr)
        float32_t pos_inf, neg_inf, item

    pos_inf = np.inf
    neg_inf = -pos_inf

    for idx in range(length):
        item = arr[idx]
        if item == pos_inf or item == neg_inf:
            return True
    return False
440+
441+
def has_infs_f8(ndarray[float64_t] arr):
    """
    Return True if the float64 array contains +inf or -inf, else False

    Scans element by element and stops at the first infinite value.
    """
    cdef:
        Py_ssize_t idx, length = len(arr)
        float64_t pos_inf, neg_inf, item

    pos_inf = np.inf
    neg_inf = -pos_inf

    for idx in range(length):
        item = arr[idx]
        if item == pos_inf or item == neg_inf:
            return True
    return False
454+
427455
# cdef class TypeConverter:
428456
# cdef:
429457
# cpython.PyTypeObject* klass_type

pandas/tests/test_series.py

+7
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,13 @@ def test_iteritems(self):
634634
def test_sum(self):
635635
self._check_stat_op('sum', np.sum)
636636

637+
def test_sum_inf(self):
    # summing a Series holding inf entries must match summing the same
    # Series with those entries set to NaN instead
    with_inf = Series(np.random.randn(10))
    with_nan = with_inf.copy()
    window = slice(5, 8)
    with_inf[window] = np.inf
    with_nan[window] = np.nan
    assert_almost_equal(with_inf.sum(), with_nan.sum())
643+
637644
def test_mean(self):
638645
self._check_stat_op('mean', np.mean)
639646

0 commit comments

Comments
 (0)