From 5bc2973ef8ada09199cd381925737f9723f5aff1 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Mar 2013 21:15:01 -0400 Subject: [PATCH] BUG: Bug in groupby with first/last where dtypes could change (GH3041_) DOC: docstring updates in core/common.py for _possibly_cast_to_timedelta --- RELEASE.rst | 5 +++-- pandas/core/common.py | 24 +++++++++++++++++++++++- pandas/core/groupby.py | 4 ++++ pandas/core/internals.py | 21 +++++++-------------- pandas/tests/test_groupby.py | 14 +++++++++----- 5 files changed, 46 insertions(+), 22 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index b132b962fcd0e..2eb7980458f8e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -145,8 +145,9 @@ pandas 0.11.0 values (see GH2922_, GH2892_), also check for out-of-bounds indices (GH3029_) - Bug in DataFrame column insertion when the column creation fails, existing frame is left in an irrecoverable state (GH3010_) - - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_) - - Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_) + - Bug in DataFrame update, combine_first where non-specified values could cause + dtype changes (GH3016_, GH3041_) + - Bug in groupby with first/last where dtypes could change (GH3041_) - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from other values), (GH2850_) - Unstack of a frame with no nans would always cause dtype upcasting (GH2929_) diff --git a/pandas/core/common.py b/pandas/core/common.py index 17a2ccac5e30e..a3e8c09839891 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -777,6 +777,26 @@ def _possibly_cast_item(obj, item, dtype): raise ValueError("Unexpected dtype encountered: %s" % dtype) +def _possibly_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. 
convert back to bool/int + or could be an astype of float64->float32 """ + + if not isinstance(result, np.ndarray): + return result + + try: + if dtype == np.float_: + return result.astype(dtype) + elif dtype == np.bool_ or dtype == np.int_: + if issubclass(result.dtype.type, np.number) and notnull(result).all(): + new_result = result.astype(dtype) + if (new_result == result).all(): + return new_result + except: + pass + + return result + def _interp_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, mask, limit=None): view = arr.view(wrap_dtype) @@ -936,7 +956,9 @@ def _possibly_convert_platform(values): return values def _possibly_cast_to_timedelta(value, coerce=True): - """ try to cast to timedelta64 w/o coercion """ + """ try to cast to timedelta64, if already a timedeltalike, then make + sure that we are [ns] (as numpy 1.6.2 is very buggy in this regards, + don't force the conversion unless coerce is True """ # deal with numpy not being able to handle certain timedelta operations if isinstance(value,np.ndarray) and value.dtype.kind == 'm': diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fe7c281afb1b9..3f12f773db96a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1594,6 +1594,10 @@ def _cython_agg_blocks(self, how, numeric_only=True): values = com.ensure_float(values) result, _ = self.grouper.aggregate(values, how, axis=agg_axis) + + # see if we can cast the block back to the original dtype + result = block._try_cast_result(result) + newb = make_block(result, block.items, block.ref_items) new_blocks.append(newb) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 96cc41be26b92..2a41bbffa3b83 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4,6 +4,7 @@ from numpy import nan import numpy as np +from pandas.core.common import _possibly_downcast_to_dtype from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes from pandas.core.indexing import 
_check_slice_bounds, _maybe_convert_indices import pandas.core.common as com @@ -560,6 +561,9 @@ class NumericBlock(Block): is_numeric = True _can_hold_na = True + def _try_cast_result(self, result): + return _possibly_downcast_to_dtype(result, self.dtype) + class FloatBlock(NumericBlock): def _can_hold_element(self, element): @@ -608,20 +612,6 @@ def _try_cast(self, element): except: # pragma: no cover return element - def _try_cast_result(self, result): - # this is quite restrictive to convert - try: - if (isinstance(result, np.ndarray) and - issubclass(result.dtype.type, np.floating)): - if com.notnull(result).all(): - new_result = result.astype(self.dtype) - if (new_result == result).all(): - return new_result - except: - pass - - return result - def should_store(self, value): return com.is_integer_dtype(value) and value.dtype == self.dtype @@ -639,6 +629,9 @@ def _try_cast(self, element): except: # pragma: no cover return element + def _try_cast_result(self, result): + return _possibly_downcast_to_dtype(result, self.dtype) + def should_store(self, value): return issubclass(value.dtype.type, np.bool_) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4dde7eeea98ce..4b1770dd4f5df 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -163,21 +163,25 @@ def test_first_last_nth(self): self.assert_(com.isnull(grouped['B'].nth(0)['foo'])) def test_first_last_nth_dtypes(self): - # tests for first / last / nth - grouped = self.df_mixed_floats.groupby('A') + df = self.df_mixed_floats.copy() + df['E'] = True + df['F'] = 1 + + # tests for first / last / nth + grouped = df.groupby('A') first = grouped.first() - expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']] + expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] expected.index = ['bar', 'foo'] assert_frame_equal(first, expected, check_names=False) last = grouped.last() - expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']] + expected = df.ix[[5, 7], 
['B', 'C', 'D', 'E', 'F']] expected.index = ['bar', 'foo'] assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']] + expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] expected.index = ['bar', 'foo'] assert_frame_equal(nth, expected, check_names=False)