BUG: groupby.cummin/cummax nans inf values for int64 #15205

Closed · wants to merge 1 commit
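Summary of the bug, inferred from the diff and tests below: for int64 columns, groupby().cummin() and groupby().cummax() treated np.iinfo(np.int64).min, whose bit pattern doubles as pandas' iNaT missing-value sentinel, as a missing value and returned NaN in its place. A minimal reproduction sketch (my construction, not taken from the PR):

    import numpy as np
    import pandas as pd

    # INT64_MIN shares its bit pattern with iNaT, pandas' datetimelike
    # missing-value sentinel.
    df = pd.DataFrame({'A': [1, 1, 2, 2],
                       'B': [np.iinfo(np.int64).min, 0, 5, 3]})

    # Before this fix the sentinel-valued integer came back as NaN (with a
    # silent upcast); with the guard added below, integer data containing
    # the sentinel is pre-converted to float64 and the value survives as data.
    print(df.groupby('A').B.cummin())
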
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
@@ -365,7 +365,7 @@ Performance Improvements
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
- Improved performance of timeseries plotting with an irregular DatetimeIndex
(or with ``compat_x=True``) (:issue:`15073`).
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)

- When reading a buffer object in the ``read_sas()`` method without a specified format, the filepath string is inferred rather than the buffer object.

35 changes: 21 additions & 14 deletions pandas/core/groupby.py
@@ -18,13 +18,13 @@
is_timedelta64_dtype, is_datetime64_dtype,
is_categorical_dtype,
is_datetimelike,
is_datetime_or_timedelta_dtype,
is_datetime64_any_dtype,
is_bool, is_integer_dtype,
is_complex_dtype,
is_bool_dtype,
is_scalar,
is_list_like,
needs_i8_conversion,
_ensure_float64,
_ensure_platform_int,
_ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
"supported for the 'how' argument")
out_shape = (self.ngroups,) + values.shape[1:]

is_datetimelike = needs_i8_conversion(values.dtype)
is_numeric = is_numeric_dtype(values.dtype)

if is_datetime_or_timedelta_dtype(values.dtype):
if is_datetimelike:
values = values.view('int64')
is_numeric = True
elif is_bool_dtype(values.dtype):
values = _ensure_float64(values)
elif is_integer_dtype(values):
values = values.astype('int64', copy=False)
# we use iNaT for the missing value on ints
# so pre-convert to guard this condition
if (values == tslib.iNaT).any():
values = _ensure_float64(values)
else:
values = values.astype('int64', copy=False)
elif is_numeric and not is_complex_dtype(values):
values = _ensure_float64(values)
else:
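The integer guard added above, as a standalone runnable sketch (coerce_for_groupby is a made-up name; iNaT is spelled out because tslib's import path varies across pandas versions):

    import numpy as np

    iNaT = np.iinfo(np.int64).min  # value of tslib.iNaT

    def coerce_for_groupby(values):
        # Integer data containing the iNaT bit pattern is promoted to
        # float64 up front, so the sentinel is never mistaken for a missing
        # value when results are post-processed.
        if issubclass(values.dtype.type, np.integer):
            if (values == iNaT).any():
                return values.astype('float64')
            return values.astype('int64', copy=False)
        return values

    print(coerce_for_groupby(np.array([iNaT, 3, 7])).dtype)  # float64
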
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
fill_value=np.nan)
counts = np.zeros(self.ngroups, dtype=np.int64)
result = self._aggregate(
result, counts, values, labels, func, is_numeric)
result, counts, values, labels, func, is_numeric,
is_datetimelike)
elif kind == 'transform':
result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
fill_value=np.nan)

# temporary storage for running-total type transforms
accum = np.empty(out_shape, dtype=out_dtype)
result = self._transform(
result, accum, values, labels, func, is_numeric)
result, values, labels, func, is_numeric, is_datetimelike)

if is_integer_dtype(result):
if len(result[result == tslib.iNaT]) > 0:
mask = result == tslib.iNaT
if mask.any():
result = result.astype('float64')
result[result == tslib.iNaT] = np.nan
result[mask] = np.nan

if kind == 'aggregate' and \
self._filter_empty_groups and not counts.all():
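The post-processing cleanup above, restated as a runnable sketch (the sentinel mask is built once instead of three times):

    import numpy as np

    iNaT = np.iinfo(np.int64).min   # value of tslib.iNaT
    result = np.array([5, iNaT, 7], dtype='int64')

    mask = result == iNaT           # computed once, reused twice
    if mask.any():
        result = result.astype('float64')
        result[mask] = np.nan       # turn the sentinel back into a real NaN
    print(result)                   # [ 5. nan  7.]
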
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
return self._cython_operation('transform', values, how, axis)

def _aggregate(self, result, counts, values, comp_ids, agg_func,
is_numeric):
is_numeric, is_datetimelike):
if values.ndim > 3:
# punting for now
raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,

return result

def _transform(self, result, accum, values, comp_ids, transform_func,
is_numeric):
def _transform(self, result, values, comp_ids, transform_func,
is_numeric, is_datetimelike):

comp_ids, _, ngroups = self.group_info
if values.ndim > 3:
# punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,

chunk = chunk.squeeze()
transform_func(result[:, :, i], values,
comp_ids, accum)
comp_ids, is_datetimelike)
else:
transform_func(result, values, comp_ids, accum)
transform_func(result, values, comp_ids, is_datetimelike)

return result

22 changes: 14 additions & 8 deletions pandas/src/algos_groupby_helper.pxi.in
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels,
ndarray[{{dest_type2}}, ndim=2] accum):
bint is_datetimelike):
"""
Only transforms on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, size
{{dest_type2}} val, min_val = 0
ndarray[{{dest_type2}}, ndim=2] accum
int64_t lab

N, K = (<object> values).shape
accum = np.empty_like(values)
accum.fill({{inf_val}})

with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
accum[lab, j] = min_val
out[i, j] = accum[lab, j]
# val = nan
else:
elif is_datetimelike:
out[i, j] = {{nan_val}}
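
A pure-Python model of the kernel above, a sketch of the semantics rather than the real templated code (group_cummin_model and the spelled-out iNaT are my names):

    import numpy as np

    iNaT = np.iinfo(np.int64).min  # the {{nan_val}} used for datetimelike data

    def group_cummin_model(values, labels, is_datetimelike):
        # One running minimum per group, seeded with +inf; output rows
        # default to NaN (the caller pre-fills `out` the same way in the
        # real code).
        out = np.full(values.shape, np.nan)
        accum = np.full(labels.max() + 1, np.inf)
        for i, (lab, val) in enumerate(zip(labels, values)):
            if lab < 0:
                continue  # row belongs to no group
            if val == val:  # not NaN; integer data always passes this test
                accum[lab] = min(accum[lab], val)
                out[i] = accum[lab]
            elif is_datetimelike:
                # the fix: only datetimelike data writes the iNaT sentinel
                out[i] = iNaT
        return out

    print(group_cummin_model(np.array([3.0, 1.0, np.nan, 2.0]),
                             np.array([0, 0, 0, 0]), False))
    # -> [ 3.  1. nan  1.]

group_cummax below is the mirror image: accum is seeded with -{{inf_val}} and tracks a running maximum.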


@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels,
ndarray[{{dest_type2}}, ndim=2] accum):
bint is_datetimelike):
"""
Only transforms on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, size
{{dest_type2}} val, max_val = 0
ndarray[{{dest_type2}}, ndim=2] accum
int64_t lab

N, K = (<object> values).shape
accum = np.empty_like(values)
accum.fill(-{{inf_val}})

with nogil:
Expand All @@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
accum[lab, j] = max_val
out[i, j] = accum[lab, j]
# val = nan
else:
elif is_datetimelike:
out[i, j] = {{nan_val}}

{{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
def group_cumprod_float64(float64_t[:, :] out,
float64_t[:, :] values,
int64_t[:] labels,
float64_t[:, :] accum):
bint is_datetimelike):
"""
Only transforms on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, size
float64_t val
float64_t[:, :] accum
int64_t lab

N, K = (<object> values).shape
accum = np.ones_like(accum)
accum = np.ones_like(values)

with nogil:
for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
def group_cumsum(numeric[:, :] out,
numeric[:, :] values,
int64_t[:] labels,
numeric[:, :] accum):
is_datetimelike):
"""
Only transforms on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, size
numeric val
numeric[:, :] accum
int64_t lab

N, K = (<object> values).shape
accum = np.zeros_like(accum)
accum = np.zeros_like(values)

with nogil:
for i in range(N):
30 changes: 11 additions & 19 deletions pandas/tests/groupby/test_groupby.py
@@ -5507,39 +5507,38 @@ def test_cython_group_transform_algos(self):
ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
(pd.algos.group_cumsum, np.cumsum, dtypes)]

is_datetimelike = False
for pd_op, np_op, dtypes in ops:
for dtype in dtypes:
data = np.array([[1], [2], [3], [4]], dtype=dtype)
ans = np.zeros_like(data)
accum = np.array([[0]], dtype=dtype)
labels = np.array([0, 0, 0, 0], dtype=np.int64)
pd_op(ans, data, labels, accum)
pd_op(ans, data, labels, is_datetimelike)
self.assert_numpy_array_equal(np_op(data), ans[:, 0],
check_dtype=False)

# with nans
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)

data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
accum = np.array([[0.0]])
actual = np.zeros_like(data)
actual.fill(np.nan)
pd.algos.group_cumprod_float64(actual, data, labels, accum)
pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
self.assert_numpy_array_equal(actual[:, 0], expected)

accum = np.array([[0.0]])
actual = np.zeros_like(data)
actual.fill(np.nan)
pd.algos.group_cumsum(actual, data, labels, accum)
pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
self.assert_numpy_array_equal(actual[:, 0], expected)

# timedelta
is_datetimelike = True
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
accum = np.array([[0]], dtype='int64')
actual = np.zeros_like(data, dtype='int64')
pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
pd.algos.group_cumsum(actual, data.view('int64'), labels,
is_datetimelike)
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
df.loc[[2, 6], 'B'] = min_val
expected.loc[[2, 3, 6, 7], 'B'] = min_val
result = df.groupby('A').cummin()

# TODO: GH 15019
# overwriting NaNs
# tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
# tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# cummax
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
df.loc[[2, 6], 'B'] = max_val
expected.loc[[2, 3, 6, 7], 'B'] = max_val
result = df.groupby('A').cummax()

# TODO: GH 15019
# overwriting NaNs
# tm.assert_frame_equal(result, expected)

tm.assert_frame_equal(result, expected)
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
# tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
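
The property these tests pin down, as a standalone sketch (pandas.util.testing matches the tm alias this test module already uses; the GH 15019 TODOs above were removed because the assertions now pass):

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm  # same tm alias as the tests above

    df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [2.0, np.nan, 1.0, 3.0]})
    # The Cython fast path must agree with a naive per-group Series.cummin,
    # NaNs included, with no spurious overwriting of NaN rows.
    result = df.groupby('A').B.cummin()
    expected = df.groupby('A').B.apply(lambda x: x.cummin())
    tm.assert_series_equal(result, expected)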