Skip to content

Commit ba05744

Browse files
mroeschke authored and jreback committed
BUG: groupby.cummin/cummax nans inf values for int64
closes #15109 Author: Matt Roeschke <[email protected]> Closes #15205 from mroeschke/fix_15109 and squashes the following commits: 717afb4 [Matt Roeschke] BUG: cummin/cummax nans inf values for int64
1 parent 5f40950 commit ba05744

File tree

4 files changed

+47
-42
lines changed

4 files changed

+47
-42
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ Performance Improvements
367367
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
368368
- Improved performance of timeseries plotting with an irregular DatetimeIndex
369369
(or with ``compat_x=True``) (:issue:`15073`).
370-
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
370+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
371371

372372
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
373373

pandas/core/groupby.py

+21-14
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
2020
is_datetimelike,
21-
is_datetime_or_timedelta_dtype,
2221
is_datetime64_any_dtype,
2322
is_bool, is_integer_dtype,
2423
is_complex_dtype,
2524
is_bool_dtype,
2625
is_scalar,
2726
is_list_like,
27+
needs_i8_conversion,
2828
_ensure_float64,
2929
_ensure_platform_int,
3030
_ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
18761876
"supported for the 'how' argument")
18771877
out_shape = (self.ngroups,) + values.shape[1:]
18781878

1879+
is_datetimelike = needs_i8_conversion(values.dtype)
18791880
is_numeric = is_numeric_dtype(values.dtype)
18801881

1881-
if is_datetime_or_timedelta_dtype(values.dtype):
1882+
if is_datetimelike:
18821883
values = values.view('int64')
18831884
is_numeric = True
18841885
elif is_bool_dtype(values.dtype):
18851886
values = _ensure_float64(values)
18861887
elif is_integer_dtype(values):
1887-
values = values.astype('int64', copy=False)
1888+
# we use iNaT for the missing value on ints
1889+
# so pre-convert to guard this condition
1890+
if (values == tslib.iNaT).any():
1891+
values = _ensure_float64(values)
1892+
else:
1893+
values = values.astype('int64', copy=False)
18881894
elif is_numeric and not is_complex_dtype(values):
18891895
values = _ensure_float64(values)
18901896
else:
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
19131919
fill_value=np.nan)
19141920
counts = np.zeros(self.ngroups, dtype=np.int64)
19151921
result = self._aggregate(
1916-
result, counts, values, labels, func, is_numeric)
1922+
result, counts, values, labels, func, is_numeric,
1923+
is_datetimelike)
19171924
elif kind == 'transform':
19181925
result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
19191926
fill_value=np.nan)
19201927

1921-
# temporary storange for running-total type tranforms
1922-
accum = np.empty(out_shape, dtype=out_dtype)
19231928
result = self._transform(
1924-
result, accum, values, labels, func, is_numeric)
1929+
result, values, labels, func, is_numeric, is_datetimelike)
19251930

19261931
if is_integer_dtype(result):
1927-
if len(result[result == tslib.iNaT]) > 0:
1932+
mask = result == tslib.iNaT
1933+
if mask.any():
19281934
result = result.astype('float64')
1929-
result[result == tslib.iNaT] = np.nan
1935+
result[mask] = np.nan
19301936

19311937
if kind == 'aggregate' and \
19321938
self._filter_empty_groups and not counts.all():
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
19621968
return self._cython_operation('transform', values, how, axis)
19631969

19641970
def _aggregate(self, result, counts, values, comp_ids, agg_func,
1965-
is_numeric):
1971+
is_numeric, is_datetimelike):
19661972
if values.ndim > 3:
19671973
# punting for now
19681974
raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
19771983

19781984
return result
19791985

1980-
def _transform(self, result, accum, values, comp_ids, transform_func,
1981-
is_numeric):
1986+
def _transform(self, result, values, comp_ids, transform_func,
1987+
is_numeric, is_datetimelike):
1988+
19821989
comp_ids, _, ngroups = self.group_info
19831990
if values.ndim > 3:
19841991
# punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
19891996

19901997
chunk = chunk.squeeze()
19911998
transform_func(result[:, :, i], values,
1992-
comp_ids, accum)
1999+
comp_ids, is_datetimelike)
19932000
else:
1994-
transform_func(result, values, comp_ids, accum)
2001+
transform_func(result, values, comp_ids, is_datetimelike)
19952002

19962003
return result
19972004

pandas/src/algos_groupby_helper.pxi.in

+14-8
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
574574
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
575575
ndarray[{{dest_type2}}, ndim=2] values,
576576
ndarray[int64_t] labels,
577-
ndarray[{{dest_type2}}, ndim=2] accum):
577+
bint is_datetimelike):
578578
"""
579579
Only transforms on axis=0
580580
"""
581581
cdef:
582582
Py_ssize_t i, j, N, K, size
583583
{{dest_type2}} val, min_val = 0
584+
ndarray[{{dest_type2}}, ndim=2] accum
584585
int64_t lab
585586

586587
N, K = (<object> values).shape
588+
accum = np.empty_like(values)
587589
accum.fill({{inf_val}})
588590

589591
with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
600602
accum[lab, j] = min_val
601603
out[i, j] = accum[lab, j]
602604
# val = nan
603-
else:
605+
elif is_datetimelike:
604606
out[i, j] = {{nan_val}}
605607

606608

@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
609611
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
610612
ndarray[{{dest_type2}}, ndim=2] values,
611613
ndarray[int64_t] labels,
612-
ndarray[{{dest_type2}}, ndim=2] accum):
614+
bint is_datetimelike):
613615
"""
614616
Only transforms on axis=0
615617
"""
616618
cdef:
617619
Py_ssize_t i, j, N, K, size
618620
{{dest_type2}} val, max_val = 0
621+
ndarray[{{dest_type2}}, ndim=2] accum
619622
int64_t lab
620623

621624
N, K = (<object> values).shape
625+
accum = np.empty_like(values)
622626
accum.fill(-{{inf_val}})
623627

624628
with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
635639
accum[lab, j] = max_val
636640
out[i, j] = accum[lab, j]
637641
# val = nan
638-
else:
642+
elif is_datetimelike:
639643
out[i, j] = {{nan_val}}
640644

641645
{{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
682686
def group_cumprod_float64(float64_t[:, :] out,
683687
float64_t[:, :] values,
684688
int64_t[:] labels,
685-
float64_t[:, :] accum):
689+
bint is_datetimelike):
686690
"""
687691
Only transforms on axis=0
688692
"""
689693
cdef:
690694
Py_ssize_t i, j, N, K, size
691695
float64_t val
696+
float64_t[:, :] accum
692697
int64_t lab
693698

694699
N, K = (<object> values).shape
695-
accum = np.ones_like(accum)
700+
accum = np.ones_like(values)
696701

697702
with nogil:
698703
for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
712717
def group_cumsum(numeric[:, :] out,
713718
numeric[:, :] values,
714719
int64_t[:] labels,
715-
numeric[:, :] accum):
720+
is_datetimelike):
716721
"""
717722
Only transforms on axis=0
718723
"""
719724
cdef:
720725
Py_ssize_t i, j, N, K, size
721726
numeric val
727+
numeric[:, :] accum
722728
int64_t lab
723729

724730
N, K = (<object> values).shape
725-
accum = np.zeros_like(accum)
731+
accum = np.zeros_like(values)
726732

727733
with nogil:
728734
for i in range(N):

pandas/tests/groupby/test_groupby.py

+11-19
Original file line numberDiff line numberDiff line change
@@ -5507,39 +5507,38 @@ def test_cython_group_transform_algos(self):
55075507
ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
55085508
(pd.algos.group_cumsum, np.cumsum, dtypes)]
55095509

5510+
is_datetimelike = False
55105511
for pd_op, np_op, dtypes in ops:
55115512
for dtype in dtypes:
55125513
data = np.array([[1], [2], [3], [4]], dtype=dtype)
55135514
ans = np.zeros_like(data)
5514-
accum = np.array([[0]], dtype=dtype)
55155515
labels = np.array([0, 0, 0, 0], dtype=np.int64)
5516-
pd_op(ans, data, labels, accum)
5516+
pd_op(ans, data, labels, is_datetimelike)
55175517
self.assert_numpy_array_equal(np_op(data), ans[:, 0],
55185518
check_dtype=False)
55195519

55205520
# with nans
55215521
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
55225522

55235523
data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
5524-
accum = np.array([[0.0]])
55255524
actual = np.zeros_like(data)
55265525
actual.fill(np.nan)
5527-
pd.algos.group_cumprod_float64(actual, data, labels, accum)
5526+
pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
55285527
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
55295528
self.assert_numpy_array_equal(actual[:, 0], expected)
55305529

5531-
accum = np.array([[0.0]])
55325530
actual = np.zeros_like(data)
55335531
actual.fill(np.nan)
5534-
pd.algos.group_cumsum(actual, data, labels, accum)
5532+
pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
55355533
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
55365534
self.assert_numpy_array_equal(actual[:, 0], expected)
55375535

55385536
# timedelta
5537+
is_datetimelike = True
55395538
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
5540-
accum = np.array([[0]], dtype='int64')
55415539
actual = np.zeros_like(data, dtype='int64')
5542-
pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
5540+
pd.algos.group_cumsum(actual, data.view('int64'), labels,
5541+
is_datetimelike)
55435542
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
55445543
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
55455544
np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
59655964
df.loc[[2, 6], 'B'] = min_val
59665965
expected.loc[[2, 3, 6, 7], 'B'] = min_val
59675966
result = df.groupby('A').cummin()
5968-
5969-
# TODO: GH 15019
5970-
# overwriting NaNs
5971-
# tm.assert_frame_equal(result, expected)
5967+
tm.assert_frame_equal(result, expected)
59725968
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5973-
# tm.assert_frame_equal(result, expected)
5969+
tm.assert_frame_equal(result, expected)
59745970

59755971
# cummax
59765972
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
59835979
df.loc[[2, 6], 'B'] = max_val
59845980
expected.loc[[2, 3, 6, 7], 'B'] = max_val
59855981
result = df.groupby('A').cummax()
5986-
5987-
# TODO: GH 15019
5988-
# overwriting NaNs
5989-
# tm.assert_frame_equal(result, expected)
5990-
5982+
tm.assert_frame_equal(result, expected)
59915983
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5992-
# tm.assert_frame_equal(result, expected)
5984+
tm.assert_frame_equal(result, expected)
59935985

59945986
# Test nan in some values
59955987
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan

0 commit comments

Comments (0)