BUG: groupby.first/last did not preserve datetime64 dtype. Closes #2133

wesm · wesm · commit adc923895e69 · 2012-11-03T21:51:07.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -69,6 +69,7 @@ pandas 0.9.1
   - Fix partial integer indexing bug in DataFrame.xs (#2107)
   - Fix variety of cut/qcut string-bin formatting bugs (#1978, #1979)
   - Raise Exception when xs view not possible of MultiIndex'd DataFrame (#2117)
+  - Fix groupby(...).first() and last() not preserving datetime64 dtype (#2133)
 
 pandas 0.9.0
 ============
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1437,11 +1437,12 @@ def convert_objects(self):
         converted : DataFrame
         """
         new_data = {}
+        convert_f = lambda x: lib.maybe_convert_objects(x, convert_datetime=1)
 
         # TODO: could be more efficient taking advantage of the block
         for col, s in self.iteritems():
             if s.dtype == np.object_:
-                new_data[col] = lib.maybe_convert_objects(s)
+                new_data[col] = convert_f(s)
             else:
                 new_data[col] = s
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -27,12 +27,16 @@ class SpecificationError(GroupByError):
     pass
 
 
-def _groupby_function(name, alias, npfunc, numeric_only=True):
+def _groupby_function(name, alias, npfunc, numeric_only=True,
+                      _convert=False):
     def f(self):
         try:
             return self._cython_agg_general(alias, numeric_only=numeric_only)
         except Exception:
-            return self.aggregate(lambda x: npfunc(x, axis=self.axis))
+            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
+            if _convert:
+                result = result.convert_objects()
+            return result
 
     f.__doc__ = "Compute %s of group values" % name
     f.__name__ = name
@@ -41,19 +45,31 @@ def f(self):
 
 
 def _first_compat(x, axis=0):
-    x = np.asarray(x)
-    x = x[com.notnull(x)]
-    if len(x) == 0:
-        return np.nan
-    return x[0]
+    def _first(x):
+        x = np.asarray(x)
+        x = x[com.notnull(x)]
+        if len(x) == 0:
+            return np.nan
+        return x[0]
+
+    if isinstance(x, DataFrame):
+        return x.apply(_first, axis=axis)
+    else:
+        return _first(x)
 
 
 def _last_compat(x, axis=0):
-    x = np.asarray(x)
-    x = x[com.notnull(x)]
-    if len(x) == 0:
-        return np.nan
-    return x[-1]
+    def _last(x):
+        x = np.asarray(x)
+        x = x[com.notnull(x)]
+        if len(x) == 0:
+            return np.nan
+        return x[-1]
+
+    if isinstance(x, DataFrame):
+        return x.apply(_last, axis=axis)
+    else:
+        return _last(x)
 
 
 class GroupBy(object):
@@ -357,8 +373,9 @@ def size(self):
     min = _groupby_function('min', 'min', np.min)
     max = _groupby_function('max', 'max', np.max)
     first = _groupby_function('first', 'first', _first_compat,
-                              numeric_only=False)
-    last = _groupby_function('last', 'last', _last_compat, numeric_only=False)
+                              numeric_only=False, _convert=True)
+    last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
+                             _convert=True)
 
     def ohlc(self):
         """
@@ -380,7 +397,7 @@ def picker(arr):
     def _cython_agg_general(self, how, numeric_only=True):
         output = {}
         for name, obj in self._iterate_slices():
-            is_numeric = issubclass(obj.dtype.type, (np.number, np.bool_))
+            is_numeric = _is_numeric_dtype(obj.dtype)
             if numeric_only and not is_numeric:
                 continue
 
@@ -699,12 +716,6 @@ def get_group_levels(self):
     _filter_empty_groups = True
 
     def aggregate(self, values, how, axis=0):
-        values = com.ensure_float(values)
-        is_numeric = True
-
-        if not issubclass(values.dtype.type, (np.number, np.bool_)):
-            values = values.astype(object)
-            is_numeric = False
 
         arity = self._cython_arity.get(how, 1)
 
@@ -721,6 +732,16 @@ def aggregate(self, values, how, axis=0):
                 raise NotImplementedError
             out_shape = (self.ngroups,) + values.shape[1:]
 
+        if _is_numeric_dtype(values.dtype):
+            values = com.ensure_float(values)
+            is_numeric = True
+        else:
+            if issubclass(values.dtype.type, np.datetime64):
+                raise Exception('Cython not able to handle this case')
+
+            values = values.astype(object)
+            is_numeric = False
+
         # will be filled in Cython function
         result = np.empty(out_shape, dtype=values.dtype)
         counts = np.zeros(self.ngroups, dtype=np.int64)
@@ -753,10 +774,11 @@ def aggregate(self, values, how, axis=0):
         return result, names
 
     def _aggregate(self, result, counts, values, how, is_numeric):
-        fdict = self._cython_functions
         if not is_numeric:
-            fdict = self._cython_object_functions
-        agg_func = fdict[how]
+            agg_func = self._cython_object_functions[how]
+        else:
+            agg_func = self._cython_functions[how]
+
         trans_func = self._cython_transforms.get(how, lambda x: x)
 
         comp_ids, _, ngroups = self.group_info
@@ -1458,12 +1480,15 @@ def _cython_agg_blocks(self, how, numeric_only=True):
 
         for block in data.blocks:
             values = block.values
-            is_numeric = issubclass(values.dtype.type, (np.number, np.bool_))
+
+            is_numeric = _is_numeric_dtype(values.dtype)
+
             if numeric_only and not is_numeric:
                 continue
 
             if is_numeric:
                 values = com.ensure_float(values)
+
             result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
             newb = make_block(result, block.items, block.ref_items)
             new_blocks.append(newb)
@@ -2231,6 +2256,12 @@ def _reorder_by_uniques(uniques, labels):
 }
 
 
+def _is_numeric_dtype(dt):
+    typ = dt.type
+    return (issubclass(typ, (np.number, np.bool_))
+            and not issubclass(typ, (np.datetime64, np.timedelta64)))
+
+
 def _intercept_function(func):
     return _func_table.get(func, func)
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -359,7 +359,7 @@ def _try_cast(self, element):
             return element
 
     def should_store(self, value):
-        return issubclass(value.dtype.type, np.integer)
+        return com.is_integer_dtype(value)
 
 class BoolBlock(Block):
     _can_hold_na = False
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -431,7 +431,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         elif util.is_complex_object(val):
             complexes[i] = val
             seen_complex = 1
-        elif PyDateTime_Check(val):
+        elif PyDateTime_Check(val) or util.is_datetime64_object(val):
             if convert_datetime:
                 seen_datetime = 1
                 idatetimes[i] = convert_to_tsobject(val).value
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2083,7 +2083,15 @@ def test_groupby_categorical_no_compress(self):
         exp = data.groupby(labels).mean().reindex(cats.levels)
         assert_series_equal(result, exp)
 
+    def test_groupby_first_datetime64(self):
+        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
+        df[1] = df[1].view('M8[ns]')
 
+        self.assert_(issubclass(df[1].dtype.type, np.datetime64))
+
+        result = df.groupby(level=0).first()
+        got_dt = result[1].dtype
+        self.assert_(issubclass(got_dt.type, np.datetime64))
 
 
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):