Skip to content

Commit b390fdb

Browse files
committed
BUG: cummin/cummax nans inf values for int64
migrate accum into cython move datetime check into cython mask int64 inf values before entering cython functions Edit whatsnew and alter condition add missing methods edit space
1 parent be3f2ae commit b390fdb

File tree

4 files changed

+47
-41
lines changed

4 files changed

+47
-41
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ Performance Improvements
365365
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
366366
- Improved performance of timeseries plotting with an irregular DatetimeIndex
367367
(or with ``compat_x=True``) (:issue:`15073`).
368-
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
368+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048` & :issue:`15109`)
369369

370370
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
371371

pandas/core/groupby.py

+21-13
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
is_bool_dtype,
2626
is_scalar,
2727
is_list_like,
28+
needs_i8_conversion,
2829
_ensure_float64,
2930
_ensure_platform_int,
3031
_ensure_int64,
@@ -1874,15 +1875,21 @@ def _cython_operation(self, kind, values, how, axis):
18741875
"supported for the 'how' argument")
18751876
out_shape = (self.ngroups,) + values.shape[1:]
18761877

1878+
is_datetimelike = needs_i8_conversion(values.dtype)
18771879
is_numeric = is_numeric_dtype(values.dtype)
18781880

1879-
if is_datetime_or_timedelta_dtype(values.dtype):
1881+
if is_datetimelike:
18801882
values = values.view('int64')
18811883
is_numeric = True
18821884
elif is_bool_dtype(values.dtype):
18831885
values = _ensure_float64(values)
18841886
elif is_integer_dtype(values):
1885-
values = values.astype('int64', copy=False)
1887+
# we use iNaT for the missing value on ints
1888+
# so pre-convert to guard this condition
1889+
if (values == tslib.iNaT).any():
1890+
values = _ensure_float64(values)
1891+
else:
1892+
values = values.astype('int64', copy=False)
18861893
elif is_numeric and not is_complex_dtype(values):
18871894
values = _ensure_float64(values)
18881895
else:
@@ -1911,20 +1918,20 @@ def _cython_operation(self, kind, values, how, axis):
19111918
fill_value=np.nan)
19121919
counts = np.zeros(self.ngroups, dtype=np.int64)
19131920
result = self._aggregate(
1914-
result, counts, values, labels, func, is_numeric)
1921+
result, counts, values, labels, func, is_numeric,
1922+
is_datetimelike)
19151923
elif kind == 'transform':
19161924
result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
19171925
fill_value=np.nan)
19181926

1919-
# temporary storage for running-total type transforms
1920-
accum = np.empty(out_shape, dtype=out_dtype)
19211927
result = self._transform(
1922-
result, accum, values, labels, func, is_numeric)
1928+
result, values, labels, func, is_numeric, is_datetimelike)
19231929

19241930
if is_integer_dtype(result):
1925-
if len(result[result == tslib.iNaT]) > 0:
1931+
mask = result == tslib.iNaT
1932+
if mask.any():
19261933
result = result.astype('float64')
1927-
result[result == tslib.iNaT] = np.nan
1934+
result[mask] = np.nan
19281935

19291936
if kind == 'aggregate' and \
19301937
self._filter_empty_groups and not counts.all():
@@ -1960,7 +1967,7 @@ def transform(self, values, how, axis=0):
19601967
return self._cython_operation('transform', values, how, axis)
19611968

19621969
def _aggregate(self, result, counts, values, comp_ids, agg_func,
1963-
is_numeric):
1970+
is_numeric, is_datetimelike):
19641971
if values.ndim > 3:
19651972
# punting for now
19661973
raise NotImplementedError("number of dimensions is currently "
@@ -1975,8 +1982,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
19751982

19761983
return result
19771984

1978-
def _transform(self, result, accum, values, comp_ids, transform_func,
1979-
is_numeric):
1985+
def _transform(self, result, values, comp_ids, transform_func,
1986+
is_numeric, is_datetimelike):
1987+
19801988
comp_ids, _, ngroups = self.group_info
19811989
if values.ndim > 3:
19821990
# punting for now
@@ -1987,9 +1995,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
19871995

19881996
chunk = chunk.squeeze()
19891997
transform_func(result[:, :, i], values,
1990-
comp_ids, accum)
1998+
comp_ids, is_datetimelike)
19911999
else:
1992-
transform_func(result, values, comp_ids, accum)
2000+
transform_func(result, values, comp_ids, is_datetimelike)
19932001

19942002
return result
19952003

pandas/src/algos_groupby_helper.pxi.in

+14-8
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
574574
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
575575
ndarray[{{dest_type2}}, ndim=2] values,
576576
ndarray[int64_t] labels,
577-
ndarray[{{dest_type2}}, ndim=2] accum):
577+
bint is_datetimelike):
578578
"""
579579
Only transforms on axis=0
580580
"""
581581
cdef:
582582
Py_ssize_t i, j, N, K, size
583583
{{dest_type2}} val, min_val = 0
584+
ndarray[{{dest_type2}}, ndim=2] accum
584585
int64_t lab
585586

586587
N, K = (<object> values).shape
588+
accum = np.empty_like(values)
587589
accum.fill({{inf_val}})
588590

589591
with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
600602
accum[lab, j] = min_val
601603
out[i, j] = accum[lab, j]
602604
# val = nan
603-
else:
605+
elif is_datetimelike:
604606
out[i, j] = {{nan_val}}
605607

606608

@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
609611
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
610612
ndarray[{{dest_type2}}, ndim=2] values,
611613
ndarray[int64_t] labels,
612-
ndarray[{{dest_type2}}, ndim=2] accum):
614+
bint is_datetimelike):
613615
"""
614616
Only transforms on axis=0
615617
"""
616618
cdef:
617619
Py_ssize_t i, j, N, K, size
618620
{{dest_type2}} val, max_val = 0
621+
ndarray[{{dest_type2}}, ndim=2] accum
619622
int64_t lab
620623

621624
N, K = (<object> values).shape
625+
accum = np.empty_like(values)
622626
accum.fill(-{{inf_val}})
623627

624628
with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
635639
accum[lab, j] = max_val
636640
out[i, j] = accum[lab, j]
637641
# val = nan
638-
else:
642+
elif is_datetimelike:
639643
out[i, j] = {{nan_val}}
640644

641645
{{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
682686
def group_cumprod_float64(float64_t[:, :] out,
683687
float64_t[:, :] values,
684688
int64_t[:] labels,
685-
float64_t[:, :] accum):
689+
bint is_datetimelike):
686690
"""
687691
Only transforms on axis=0
688692
"""
689693
cdef:
690694
Py_ssize_t i, j, N, K, size
691695
float64_t val
696+
float64_t[:, :] accum
692697
int64_t lab
693698

694699
N, K = (<object> values).shape
695-
accum = np.ones_like(accum)
700+
accum = np.ones_like(values)
696701

697702
with nogil:
698703
for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
712717
def group_cumsum(numeric[:, :] out,
713718
numeric[:, :] values,
714719
int64_t[:] labels,
715-
numeric[:, :] accum):
720+
is_datetimelike):
716721
"""
717722
Only transforms on axis=0
718723
"""
719724
cdef:
720725
Py_ssize_t i, j, N, K, size
721726
numeric val
727+
numeric[:, :] accum
722728
int64_t lab
723729

724730
N, K = (<object> values).shape
725-
accum = np.zeros_like(accum)
731+
accum = np.zeros_like(values)
726732

727733
with nogil:
728734
for i in range(N):

pandas/tests/groupby/test_groupby.py

+11-19
Original file line numberDiff line numberDiff line change
@@ -5504,39 +5504,38 @@ def test_cython_group_transform_algos(self):
55045504
ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
55055505
(pd.algos.group_cumsum, np.cumsum, dtypes)]
55065506

5507+
is_datetimelike = False
55075508
for pd_op, np_op, dtypes in ops:
55085509
for dtype in dtypes:
55095510
data = np.array([[1], [2], [3], [4]], dtype=dtype)
55105511
ans = np.zeros_like(data)
5511-
accum = np.array([[0]], dtype=dtype)
55125512
labels = np.array([0, 0, 0, 0], dtype=np.int64)
5513-
pd_op(ans, data, labels, accum)
5513+
pd_op(ans, data, labels, is_datetimelike)
55145514
self.assert_numpy_array_equal(np_op(data), ans[:, 0],
55155515
check_dtype=False)
55165516

55175517
# with nans
55185518
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
55195519

55205520
data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
5521-
accum = np.array([[0.0]])
55225521
actual = np.zeros_like(data)
55235522
actual.fill(np.nan)
5524-
pd.algos.group_cumprod_float64(actual, data, labels, accum)
5523+
pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
55255524
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
55265525
self.assert_numpy_array_equal(actual[:, 0], expected)
55275526

5528-
accum = np.array([[0.0]])
55295527
actual = np.zeros_like(data)
55305528
actual.fill(np.nan)
5531-
pd.algos.group_cumsum(actual, data, labels, accum)
5529+
pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
55325530
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
55335531
self.assert_numpy_array_equal(actual[:, 0], expected)
55345532

55355533
# timedelta
5534+
is_datetimelike = True
55365535
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
5537-
accum = np.array([[0]], dtype='int64')
55385536
actual = np.zeros_like(data, dtype='int64')
5539-
pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
5537+
pd.algos.group_cumsum(actual, data.view('int64'), labels,
5538+
is_datetimelike)
55405539
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
55415540
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
55425541
np.timedelta64(5, 'ns')])
@@ -5962,12 +5961,9 @@ def test_cummin_cummax(self):
59625961
df.loc[[2, 6], 'B'] = min_val
59635962
expected.loc[[2, 3, 6, 7], 'B'] = min_val
59645963
result = df.groupby('A').cummin()
5965-
5966-
# TODO: GH 15019
5967-
# overwriting NaNs
5968-
# tm.assert_frame_equal(result, expected)
5964+
tm.assert_frame_equal(result, expected)
59695965
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5970-
# tm.assert_frame_equal(result, expected)
5966+
tm.assert_frame_equal(result, expected)
59715967

59725968
# cummax
59735969
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5980,13 +5976,9 @@ def test_cummin_cummax(self):
59805976
df.loc[[2, 6], 'B'] = max_val
59815977
expected.loc[[2, 3, 6, 7], 'B'] = max_val
59825978
result = df.groupby('A').cummax()
5983-
5984-
# TODO: GH 15019
5985-
# overwriting NaNs
5986-
# tm.assert_frame_equal(result, expected)
5987-
5979+
tm.assert_frame_equal(result, expected)
59885980
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5989-
# tm.assert_frame_equal(result, expected)
5981+
tm.assert_frame_equal(result, expected)
59905982

59915983
# Test nan in some values
59925984
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan

0 commit comments

Comments
 (0)