From de9f3c1f58672bb63459638aab7f25ea3d1b7ad8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 25 Jul 2013 14:21:14 -0400 Subject: [PATCH] BUG: fix sum over integer frames --- doc/source/release.rst | 2 ++ doc/source/v0.13.0.txt | 3 +++ pandas/core/nanops.py | 13 +++++++------ vb_suite/stat_ops.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 7b174611652de..225d4fde8d5c0 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -54,6 +54,8 @@ pandas 0.13 representation of the index (:issue:`4136`) - Fix running of stata IO tests. Now uses temporary files to write (:issue:`4353`) + - Fixed an issue where ``DataFrame.sum`` was slower than ``DataFrame.mean`` + for integer valued frames (:issue:`4365`) pandas 0.12 =========== diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index e51206b3c2fe4..a55e9a0f35603 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -33,6 +33,9 @@ Bug Fixes - Fix running of stata IO tests. Now uses temporary files to write (:issue:`4353`) + - Fixed an issue where ``DataFrame.sum`` was slower than ``DataFrame.mean`` + for integer valued frames (:issue:`4365`) + See the :ref:`full release notes <release>` or issue tracker on GitHub for a complete list. 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0d940dc348dc1..b2ff366daa826 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -71,7 +71,9 @@ def f(values, axis=None, skipna=True, **kwds): if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype): result = bn_func(values, axis=axis, **kwds) - # prefer to treat inf/-inf as NA + + # prefer to treat inf/-inf as NA, but must compute the func + # twice :( if _has_infs(result): result = alt(values, axis=axis, skipna=skipna, **kwds) else: @@ -86,7 +88,8 @@ def f(values, axis=None, skipna=True, **kwds): def _bn_ok_dtype(dt): # Bottleneck chokes on datetime64 - return dt != np.object_ and not issubclass(dt.type, (np.datetime64,np.timedelta64)) + time_types = np.datetime64, np.timedelta64 + return dt != np.object_ and not issubclass(dt.type, time_types) def _has_infs(result): @@ -95,10 +98,8 @@ def _has_infs(result): return lib.has_infs_f8(result) elif result.dtype == 'f4': return lib.has_infs_f4(result) - else: # pragma: no cover - raise TypeError('Only suppose float32/64 here') - else: - return np.isinf(result) or np.isneginf(result) + return False + return np.isinf(result) or np.isneginf(result) def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): """ return the correct fill value for the dtype of the values """ diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py index 2bc7e04cc8848..91741eb3c3759 100644 --- a/vb_suite/stat_ops.py +++ b/vb_suite/stat_ops.py @@ -43,6 +43,35 @@ Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1, start_date=datetime(2011, 11, 15)) +sum_setup = common_setup + """ +df = DataFrame(np.random.randn(100000, 4)) +dfi = DataFrame(np.random.randint(1000, size=df.shape)) +""" + +stat_ops_frame_sum_int_axis_0 = \ + Benchmark("dfi.sum()", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_sum_float_axis_0 = \ + Benchmark("df.sum()", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_mean_int_axis_0 = \ + 
Benchmark("dfi.mean()", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_mean_float_axis_0 = \ + Benchmark("df.mean()", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_sum_int_axis_1 = \ + Benchmark("dfi.sum(1)", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_sum_float_axis_1 = \ + Benchmark("df.sum(1)", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_mean_int_axis_1 = \ + Benchmark("dfi.mean(1)", sum_setup, start_date=datetime(2013, 7, 25)) + +stat_ops_frame_mean_float_axis_1 = \ + Benchmark("df.mean(1)", sum_setup, start_date=datetime(2013, 7, 25)) + #---------------------------------------------------------------------- # rank