Merge pull request #9345 from iwschris/groupby_nano_int

jreback · jreback · commit 3f24b879e348 · 2015-02-13T22:10:28.000-05:00
BUG: Fixes GH9311 groupby on datetime64
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -187,6 +187,8 @@ Bug Fixes
 - Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)
 - Bug in ``Categorical.__getitem__/__setitem__`` with listlike input getting incorrect results from indexer coercion (:issue:`9469`)
 - Bug in partial setting with a DatetimeIndex (:issue:`9478`)
+- Bug in groupby for integer and datetime64 columns when applying an aggregator that caused the value to be
+  changed when the number was sufficiently large (:issue:`9311`, :issue:`6620`)
 - Fixed bug in ``to_sql`` when mapping a ``Timestamp`` object column (datetime
   column with timezone info) to the according sqlalchemy type (:issue:`9085`).
 - Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -24,7 +24,8 @@
 from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype,
-                               is_categorical_dtype, _values_from_object)
+                               is_categorical_dtype, _values_from_object,
+                               _is_datetime_or_timedelta_dtype, is_bool_dtype)
 from pandas.core.config import option_context
 import pandas.lib as lib
 from pandas.lib import Timestamp
@@ -1444,7 +1445,9 @@ def get_func(fname):
                 f = getattr(_algos, "%s_%s" % (fname, dtype_str), None)
                 if f is not None:
                     return f
-            return getattr(_algos, fname, None)
+
+            if dtype_str == 'float64':
+                return getattr(_algos, fname, None)
 
         ftype = self._cython_functions[how]
 
@@ -1471,7 +1474,6 @@ def wrapper(*args, **kwargs):
         return func, dtype_str
 
     def aggregate(self, values, how, axis=0):
-
         arity = self._cython_arity.get(how, 1)
 
         vdim = values.ndim
@@ -1487,27 +1489,44 @@ def aggregate(self, values, how, axis=0):
                 raise NotImplementedError
             out_shape = (self.ngroups,) + values.shape[1:]
 
-        if is_numeric_dtype(values.dtype):
-            values = com.ensure_float(values)
-            is_numeric = True
-            out_dtype = 'f%d' % values.dtype.itemsize
+        is_numeric = is_numeric_dtype(values.dtype)
+
+        if _is_datetime_or_timedelta_dtype(values.dtype):
+            values = values.view('int64')
+        elif is_bool_dtype(values.dtype):
+            values = _algos.ensure_float64(values)
+        elif com.is_integer_dtype(values):
+            values = values.astype('int64', copy=False)
+        elif is_numeric:
+            values = _algos.ensure_float64(values)
         else:
-            is_numeric = issubclass(values.dtype.type, (np.datetime64,
-                                                        np.timedelta64))
+            values = values.astype(object)
+
+        try:
+            agg_func, dtype_str = self._get_aggregate_function(how, values)
+        except NotImplementedError:
             if is_numeric:
-                out_dtype = 'float64'
-                values = values.view('int64')
+                values = _algos.ensure_float64(values)
+                agg_func, dtype_str = self._get_aggregate_function(how, values)
             else:
-                out_dtype = 'object'
-                values = values.astype(object)
+                raise
+
+        if is_numeric:
+            out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
+        else:
+            out_dtype = 'object'
 
         # will be filled in Cython function
         result = np.empty(out_shape, dtype=out_dtype)
-
         result.fill(np.nan)
         counts = np.zeros(self.ngroups, dtype=np.int64)
 
-        result = self._aggregate(result, counts, values, how, is_numeric)
+        result = self._aggregate(result, counts, values, agg_func, is_numeric)
+
+        if com.is_integer_dtype(result):
+            if len(result[result == tslib.iNaT]) > 0:
+                result = result.astype('float64')
+                result[result == tslib.iNaT] = np.nan
 
         if self._filter_empty_groups and not counts.all():
             if result.ndim == 2:
@@ -1535,9 +1554,7 @@ def aggregate(self, values, how, axis=0):
 
         return result, names
 
-    def _aggregate(self, result, counts, values, how, is_numeric):
-        agg_func, dtype = self._get_aggregate_function(how, values)
-
+    def _aggregate(self, result, counts, values, agg_func, is_numeric):
         comp_ids, _, ngroups = self.group_info
         if values.ndim > 3:
             # punting for now
@@ -1796,9 +1813,7 @@ def size(self):
         'ohlc': lambda *args: ['open', 'high', 'low', 'close']
     }
 
-    def _aggregate(self, result, counts, values, how, is_numeric=True):
-
-        agg_func, dtype = self._get_aggregate_function(how, values)
+    def _aggregate(self, result, counts, values, agg_func, is_numeric=True):
 
         if values.ndim > 3:
             # punting for now
@@ -2535,9 +2550,6 @@ def _cython_agg_blocks(self, how, numeric_only=True):
 
             values = block._try_operate(block.values)
 
-            if block.is_numeric:
-                values = _algos.ensure_float64(values)
-
             result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
 
             # see if we can cast the block back to the original dtype
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1811,10 +1811,7 @@ def _try_coerce_args(self, values, other):
     def _try_coerce_result(self, result):
         """ reverse of try_coerce_args """
         if isinstance(result, np.ndarray):
-            if result.dtype == 'i8':
-                result = tslib.array_to_datetime(
-                    result.astype(object).ravel()).reshape(result.shape)
-            elif result.dtype.kind in ['i', 'f', 'O']:
+            if result.dtype.kind in ['i', 'f', 'O']:
                 result = result.astype('M8[ns]')
         elif isinstance(result, (np.integer, np.datetime64)):
             result = lib.Timestamp(result)
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -3,6 +3,9 @@
 # don't introduce a pandas/pandas.compat import
 # or we get a bootstrapping problem
 from StringIO import StringIO
+import numpy as np
+
+_int64_max = np.iinfo(np.int64).max
 
 header = """
 cimport numpy as np
@@ -680,7 +683,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(len(counts)):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = resx[i, j]
 """
@@ -726,7 +729,7 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(ngroups):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = resx[i, j]
 """
@@ -773,7 +776,7 @@ def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(ngroups):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = resx[i, j]
 """
@@ -819,7 +822,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(len(counts)):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = resx[i, j]
 """
@@ -1278,7 +1281,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     nobs = np.zeros_like(out)
 
     minx = np.empty_like(out)
-    minx.fill(np.inf)
+    minx.fill(%(inf_val)s)
 
     if bins[len(bins) - 1] == len(values):
         ngroups = len(bins)
@@ -1319,7 +1322,7 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(ngroups):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = minx[i, j]
 """
@@ -1344,7 +1347,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     nobs = np.zeros_like(out)
 
     maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
+    maxx.fill(-%(inf_val)s)
 
     N, K = (<object> values).shape
 
@@ -1381,7 +1384,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(len(counts)):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = maxx[i, j]
 """
@@ -1402,7 +1405,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
 
     nobs = np.zeros_like(out)
     maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
+    maxx.fill(-%(inf_val)s)
 
     if bins[len(bins) - 1] == len(values):
         ngroups = len(bins)
@@ -1443,7 +1446,7 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(ngroups):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = maxx[i, j]
 """
@@ -1469,7 +1472,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     nobs = np.zeros_like(out)
 
     minx = np.empty_like(out)
-    minx.fill(np.inf)
+    minx.fill(%(inf_val)s)
 
     N, K = (<object> values).shape
 
@@ -1506,7 +1509,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(len(counts)):
         for j in range(K):
             if nobs[i, j] == 0:
-                out[i, j] = nan
+                out[i, j] = %(nan_val)s
             else:
                 out[i, j] = minx[i, j]
 """
@@ -2286,6 +2289,70 @@ def generate_put_template(template, use_ints=True, use_floats=True,
         output.write(func)
     return output.getvalue()
 
+def generate_put_min_max_template(template, use_ints=True, use_floats=True,
+                                  use_objects=False, use_datelikes=False):
+    floats_list = [
+        ('float64', 'float64_t', 'nan', 'np.inf'),
+        ('float32', 'float32_t', 'nan', 'np.inf'),
+    ]
+    ints_list = [
+        ('int64', 'int64_t', 'iNaT', _int64_max),
+    ]
+    date_like_list = [
+        ('int64', 'int64_t', 'iNaT', _int64_max),
+    ]
+    object_list = [('object', 'object', 'nan', 'np.inf')]
+    function_list = []
+    if use_floats:
+        function_list.extend(floats_list)
+    if use_ints:
+        function_list.extend(ints_list)
+    if use_objects:
+        function_list.extend(object_list)
+    if use_datelikes:
+        function_list.extend(date_like_list)
+
+    output = StringIO()
+    for name, dest_type, nan_val, inf_val in function_list:
+        func = template % {'name': name,
+                           'dest_type2': dest_type,
+                           'nan_val': nan_val,
+                           'inf_val': inf_val}
+        output.write(func)
+    return output.getvalue()
+
+def generate_put_selection_template(template, use_ints=True, use_floats=True,
+                                    use_objects=False, use_datelikes=False):
+    floats_list = [
+        ('float64', 'float64_t', 'float64_t', 'nan'),
+        ('float32', 'float32_t', 'float32_t', 'nan'),
+    ]
+    ints_list = [
+        ('int64', 'int64_t', 'int64_t', 'iNaT'),
+    ]
+    date_like_list = [
+        ('int64', 'int64_t', 'int64_t', 'iNaT'),
+    ]
+    object_list = [('object', 'object', 'object', 'nan')]
+    function_list = []
+    if use_floats:
+        function_list.extend(floats_list)
+    if use_ints:
+        function_list.extend(ints_list)
+    if use_objects:
+        function_list.extend(object_list)
+    if use_datelikes:
+        function_list.extend(date_like_list)
+
+    output = StringIO()
+    for name, c_type, dest_type, nan_val in function_list:
+        func = template % {'name': name,
+                           'c_type': c_type,
+                           'dest_type2': dest_type,
+                           'nan_val': nan_val}
+        output.write(func)
+    return output.getvalue()
+
 def generate_take_template(template, exclude=None):
     # name, dest, ctypein, ctypeout, preval, postval, cancopy
     function_list = [
@@ -2347,24 +2414,27 @@ def generate_from_template(template, exclude=None):
     return output.getvalue()
 
 put_2d = [diff_2d_template]
-groupbys = [group_last_template,
-            group_last_bin_template,
-            group_nth_template,
-            group_nth_bin_template,
-            group_add_template,
+
+groupbys = [group_add_template,
             group_add_bin_template,
             group_prod_template,
             group_prod_bin_template,
             group_var_template,
             group_var_bin_template,
             group_mean_template,
             group_mean_bin_template,
-            group_min_template,
-            group_min_bin_template,
-            group_max_template,
-            group_max_bin_template,
             group_ohlc_template]
 
+groupby_selection = [group_last_template,
+                     group_last_bin_template,
+                     group_nth_template,
+                     group_nth_bin_template]
+
+groupby_min_max = [group_min_template,
+                   group_min_bin_template,
+                   group_max_template,
+                   group_max_bin_template]
+
 groupby_count = [group_count_template, group_count_bin_template]
 
 templates_1d = [map_indices_template,
@@ -2407,9 +2477,18 @@ def generate_take_cython_file(path='generated.pyx'):
         for template in groupbys:
             print(generate_put_template(template, use_ints=False), file=f)
 
+        for template in groupby_selection:
+            print(generate_put_selection_template(template, use_ints=True),
+                  file=f)
+
+        for template in groupby_min_max:
+            print(generate_put_min_max_template(template, use_ints=True),
+                  file=f)
+
         for template in groupby_count:
-            print(generate_put_template(template, use_ints=False,
-                                        use_datelikes=True, use_objects=True),
+            print(generate_put_selection_template(template, use_ints=True,
+                                                  use_datelikes=True,
+                                                  use_objects=True),
                   file=f)
 
         # for template in templates_1d_datetime:
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py