From 5bc2973ef8ada09199cd381925737f9723f5aff1 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Wed, 13 Mar 2013 21:15:01 -0400
Subject: [PATCH] BUG: Bug in groupby with first/last where dtypes could change
 (GH3041_) DOC: docstring updates in core/common.py for
 _possibly_cast_to_timedelta

---
 RELEASE.rst                  |  5 +++--
 pandas/core/common.py        | 24 +++++++++++++++++++++++-
 pandas/core/groupby.py       |  4 ++++
 pandas/core/internals.py     | 21 +++++++--------------
 pandas/tests/test_groupby.py | 14 +++++++++-----
 5 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index b132b962fcd0e..2eb7980458f8e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -145,8 +145,9 @@ pandas 0.11.0
     values (see GH2922_, GH2892_), also check for out-of-bounds indices (GH3029_)
   - Bug in DataFrame column insertion when the column creation fails, existing frame is left in
     an irrecoverable state (GH3010_)
-  - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
-  - Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
+  - Bug in DataFrame update and combine_first where non-specified values could
+    cause dtype changes (GH3016_, GH3041_)
+  - Bug in groupby with first/last where dtypes could change (GH3041_)
   - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from 
     other values), (GH2850_)
   - Unstack of a frame with no nans would always cause dtype upcasting (GH2929_)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 17a2ccac5e30e..a3e8c09839891 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -777,6 +777,26 @@ def _possibly_cast_item(obj, item, dtype):
             raise ValueError("Unexpected dtype encountered: %s" % dtype)
 
 
+def _possibly_downcast_to_dtype(result, dtype):
+    """ try to cast to the specified dtype (e.g. convert back to bool/int
+        or could be an astype of float64->float32 """
+
+    if not isinstance(result, np.ndarray):
+        return result
+
+    try:
+        if dtype == np.float_:
+            return result.astype(dtype)
+        elif dtype == np.bool_ or dtype == np.int_:
+            if issubclass(result.dtype.type, np.number) and notnull(result).all():
+                new_result = result.astype(dtype)
+                if (new_result == result).all():
+                    return new_result
+    except:
+        pass
+
+    return result
+
 def _interp_wrapper(f, wrap_dtype, na_override=None):
     def wrapper(arr, mask, limit=None):
         view = arr.view(wrap_dtype)
@@ -936,7 +956,9 @@ def _possibly_convert_platform(values):
     return values
 
 def _possibly_cast_to_timedelta(value, coerce=True):
-    """ try to cast to timedelta64 w/o coercion """
+    """ try to cast to timedelta64; if already timedelta-like, make sure
+        that we are [ns] (numpy 1.6.2 is very buggy in this regard);
+        don't force the conversion unless coerce is True """
 
     # deal with numpy not being able to handle certain timedelta operations
     if isinstance(value,np.ndarray) and value.dtype.kind == 'm':
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index fe7c281afb1b9..3f12f773db96a 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1594,6 +1594,10 @@ def _cython_agg_blocks(self, how, numeric_only=True):
                 values = com.ensure_float(values)
 
             result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
+
+            # see if we can cast the block back to the original dtype
+            result = block._try_cast_result(result)
+
             newb = make_block(result, block.items, block.ref_items)
             new_blocks.append(newb)
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 96cc41be26b92..2a41bbffa3b83 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -4,6 +4,7 @@
 from numpy import nan
 import numpy as np
 
+from pandas.core.common import _possibly_downcast_to_dtype
 from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
 from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
 import pandas.core.common as com
@@ -560,6 +561,9 @@ class NumericBlock(Block):
     is_numeric = True
     _can_hold_na = True
 
+    def _try_cast_result(self, result):
+        return _possibly_downcast_to_dtype(result, self.dtype)
+
 class FloatBlock(NumericBlock):
 
     def _can_hold_element(self, element):
@@ -608,20 +612,6 @@ def _try_cast(self, element):
         except:  # pragma: no cover
             return element
 
-    def _try_cast_result(self, result):
-        # this is quite restrictive to convert
-        try:
-            if (isinstance(result, np.ndarray) and
-                    issubclass(result.dtype.type, np.floating)):
-                if com.notnull(result).all():
-                    new_result = result.astype(self.dtype)
-                    if (new_result == result).all():
-                        return new_result
-        except:
-            pass
-
-        return result
-
     def should_store(self, value):
         return com.is_integer_dtype(value) and value.dtype == self.dtype
 
@@ -639,6 +629,9 @@ def _try_cast(self, element):
         except:  # pragma: no cover
             return element
 
+    def _try_cast_result(self, result):
+        return _possibly_downcast_to_dtype(result, self.dtype)
+
     def should_store(self, value):
         return issubclass(value.dtype.type, np.bool_)
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 4dde7eeea98ce..4b1770dd4f5df 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -163,21 +163,25 @@ def test_first_last_nth(self):
         self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
 
     def test_first_last_nth_dtypes(self):
-        # tests for first / last / nth
 
-        grouped = self.df_mixed_floats.groupby('A')
+        df = self.df_mixed_floats.copy()
+        df['E'] = True
+        df['F'] = 1
+
+        # tests for first / last / nth
+        grouped = df.groupby('A')
         first = grouped.first()
-        expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']]
+        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
         expected.index = ['bar', 'foo']
         assert_frame_equal(first, expected, check_names=False)
 
         last = grouped.last()
-        expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']]
+        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
         expected.index = ['bar', 'foo']
         assert_frame_equal(last, expected, check_names=False)
 
         nth = grouped.nth(1)
-        expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']]
+        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
         expected.index = ['bar', 'foo']
         assert_frame_equal(nth, expected, check_names=False)