Skip to content

Commit 717afb4

Browse files
committed
BUG: cummin/cummax nans inf values for int64
migrate accum into cython move datetime check into cython mask int64 inf values before entering cython functions Edit whatsnew and alter condition add missing methods edit space Linting & whatsnew edit
1 parent 64d7670 commit 717afb4

File tree

4 files changed

+47
-42
lines changed

4 files changed

+47
-42
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ Performance Improvements
365365
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
366366
- Improved performance of timeseries plotting with an irregular DatetimeIndex
367367
(or with ``compat_x=True``) (:issue:`15073`).
368-
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
368+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
369369

370370
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
371371

pandas/core/groupby.py

+21-14
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
is_timedelta64_dtype, is_datetime64_dtype,
1919
is_categorical_dtype,
2020
is_datetimelike,
21-
is_datetime_or_timedelta_dtype,
2221
is_datetime64_any_dtype,
2322
is_bool, is_integer_dtype,
2423
is_complex_dtype,
2524
is_bool_dtype,
2625
is_scalar,
2726
is_list_like,
27+
needs_i8_conversion,
2828
_ensure_float64,
2929
_ensure_platform_int,
3030
_ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
18761876
"supported for the 'how' argument")
18771877
out_shape = (self.ngroups,) + values.shape[1:]
18781878

1879+
is_datetimelike = needs_i8_conversion(values.dtype)
18791880
is_numeric = is_numeric_dtype(values.dtype)
18801881

1881-
if is_datetime_or_timedelta_dtype(values.dtype):
1882+
if is_datetimelike:
18821883
values = values.view('int64')
18831884
is_numeric = True
18841885
elif is_bool_dtype(values.dtype):
18851886
values = _ensure_float64(values)
18861887
elif is_integer_dtype(values):
1887-
values = values.astype('int64', copy=False)
1888+
# we use iNaT for the missing value on ints
1889+
# so pre-convert to guard this condition
1890+
if (values == tslib.iNaT).any():
1891+
values = _ensure_float64(values)
1892+
else:
1893+
values = values.astype('int64', copy=False)
18881894
elif is_numeric and not is_complex_dtype(values):
18891895
values = _ensure_float64(values)
18901896
else:
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
19131919
fill_value=np.nan)
19141920
counts = np.zeros(self.ngroups, dtype=np.int64)
19151921
result = self._aggregate(
1916-
result, counts, values, labels, func, is_numeric)
1922+
result, counts, values, labels, func, is_numeric,
1923+
is_datetimelike)
19171924
elif kind == 'transform':
19181925
result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
19191926
fill_value=np.nan)
19201927

1921-
# temporary storange for running-total type tranforms
1922-
accum = np.empty(out_shape, dtype=out_dtype)
19231928
result = self._transform(
1924-
result, accum, values, labels, func, is_numeric)
1929+
result, values, labels, func, is_numeric, is_datetimelike)
19251930

19261931
if is_integer_dtype(result):
1927-
if len(result[result == tslib.iNaT]) > 0:
1932+
mask = result == tslib.iNaT
1933+
if mask.any():
19281934
result = result.astype('float64')
1929-
result[result == tslib.iNaT] = np.nan
1935+
result[mask] = np.nan
19301936

19311937
if kind == 'aggregate' and \
19321938
self._filter_empty_groups and not counts.all():
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
19621968
return self._cython_operation('transform', values, how, axis)
19631969

19641970
def _aggregate(self, result, counts, values, comp_ids, agg_func,
1965-
is_numeric):
1971+
is_numeric, is_datetimelike):
19661972
if values.ndim > 3:
19671973
# punting for now
19681974
raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
19771983

19781984
return result
19791985

1980-
def _transform(self, result, accum, values, comp_ids, transform_func,
1981-
is_numeric):
1986+
def _transform(self, result, values, comp_ids, transform_func,
1987+
is_numeric, is_datetimelike):
1988+
19821989
comp_ids, _, ngroups = self.group_info
19831990
if values.ndim > 3:
19841991
# punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
19891996

19901997
chunk = chunk.squeeze()
19911998
transform_func(result[:, :, i], values,
1992-
comp_ids, accum)
1999+
comp_ids, is_datetimelike)
19932000
else:
1994-
transform_func(result, values, comp_ids, accum)
2001+
transform_func(result, values, comp_ids, is_datetimelike)
19952002

19962003
return result
19972004

pandas/src/algos_groupby_helper.pxi.in

+14-8
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
574574
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
575575
ndarray[{{dest_type2}}, ndim=2] values,
576576
ndarray[int64_t] labels,
577-
ndarray[{{dest_type2}}, ndim=2] accum):
577+
bint is_datetimelike):
578578
"""
579579
Only transforms on axis=0
580580
"""
581581
cdef:
582582
Py_ssize_t i, j, N, K, size
583583
{{dest_type2}} val, min_val = 0
584+
ndarray[{{dest_type2}}, ndim=2] accum
584585
int64_t lab
585586

586587
N, K = (<object> values).shape
588+
accum = np.empty_like(values)
587589
accum.fill({{inf_val}})
588590

589591
with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
600602
accum[lab, j] = min_val
601603
out[i, j] = accum[lab, j]
602604
# val = nan
603-
else:
605+
elif is_datetimelike:
604606
out[i, j] = {{nan_val}}
605607

606608

@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
609611
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
610612
ndarray[{{dest_type2}}, ndim=2] values,
611613
ndarray[int64_t] labels,
612-
ndarray[{{dest_type2}}, ndim=2] accum):
614+
bint is_datetimelike):
613615
"""
614616
Only transforms on axis=0
615617
"""
616618
cdef:
617619
Py_ssize_t i, j, N, K, size
618620
{{dest_type2}} val, max_val = 0
621+
ndarray[{{dest_type2}}, ndim=2] accum
619622
int64_t lab
620623

621624
N, K = (<object> values).shape
625+
accum = np.empty_like(values)
622626
accum.fill(-{{inf_val}})
623627

624628
with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
635639
accum[lab, j] = max_val
636640
out[i, j] = accum[lab, j]
637641
# val = nan
638-
else:
642+
elif is_datetimelike:
639643
out[i, j] = {{nan_val}}
640644

641645
{{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
682686
def group_cumprod_float64(float64_t[:, :] out,
683687
float64_t[:, :] values,
684688
int64_t[:] labels,
685-
float64_t[:, :] accum):
689+
bint is_datetimelike):
686690
"""
687691
Only transforms on axis=0
688692
"""
689693
cdef:
690694
Py_ssize_t i, j, N, K, size
691695
float64_t val
696+
float64_t[:, :] accum
692697
int64_t lab
693698

694699
N, K = (<object> values).shape
695-
accum = np.ones_like(accum)
700+
accum = np.ones_like(values)
696701

697702
with nogil:
698703
for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
712717
def group_cumsum(numeric[:, :] out,
713718
numeric[:, :] values,
714719
int64_t[:] labels,
715-
numeric[:, :] accum):
720+
is_datetimelike):
716721
"""
717722
Only transforms on axis=0
718723
"""
719724
cdef:
720725
Py_ssize_t i, j, N, K, size
721726
numeric val
727+
numeric[:, :] accum
722728
int64_t lab
723729

724730
N, K = (<object> values).shape
725-
accum = np.zeros_like(accum)
731+
accum = np.zeros_like(values)
726732

727733
with nogil:
728734
for i in range(N):

pandas/tests/groupby/test_groupby.py

+11-19
Original file line numberDiff line numberDiff line change
@@ -5507,39 +5507,38 @@ def test_cython_group_transform_algos(self):
55075507
ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
55085508
(pd.algos.group_cumsum, np.cumsum, dtypes)]
55095509

5510+
is_datetimelike = False
55105511
for pd_op, np_op, dtypes in ops:
55115512
for dtype in dtypes:
55125513
data = np.array([[1], [2], [3], [4]], dtype=dtype)
55135514
ans = np.zeros_like(data)
5514-
accum = np.array([[0]], dtype=dtype)
55155515
labels = np.array([0, 0, 0, 0], dtype=np.int64)
5516-
pd_op(ans, data, labels, accum)
5516+
pd_op(ans, data, labels, is_datetimelike)
55175517
self.assert_numpy_array_equal(np_op(data), ans[:, 0],
55185518
check_dtype=False)
55195519

55205520
# with nans
55215521
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
55225522

55235523
data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
5524-
accum = np.array([[0.0]])
55255524
actual = np.zeros_like(data)
55265525
actual.fill(np.nan)
5527-
pd.algos.group_cumprod_float64(actual, data, labels, accum)
5526+
pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
55285527
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
55295528
self.assert_numpy_array_equal(actual[:, 0], expected)
55305529

5531-
accum = np.array([[0.0]])
55325530
actual = np.zeros_like(data)
55335531
actual.fill(np.nan)
5534-
pd.algos.group_cumsum(actual, data, labels, accum)
5532+
pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
55355533
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
55365534
self.assert_numpy_array_equal(actual[:, 0], expected)
55375535

55385536
# timedelta
5537+
is_datetimelike = True
55395538
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
5540-
accum = np.array([[0]], dtype='int64')
55415539
actual = np.zeros_like(data, dtype='int64')
5542-
pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
5540+
pd.algos.group_cumsum(actual, data.view('int64'), labels,
5541+
is_datetimelike)
55435542
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
55445543
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
55455544
np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
59655964
df.loc[[2, 6], 'B'] = min_val
59665965
expected.loc[[2, 3, 6, 7], 'B'] = min_val
59675966
result = df.groupby('A').cummin()
5968-
5969-
# TODO: GH 15019
5970-
# overwriting NaNs
5971-
# tm.assert_frame_equal(result, expected)
5967+
tm.assert_frame_equal(result, expected)
59725968
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5973-
# tm.assert_frame_equal(result, expected)
5969+
tm.assert_frame_equal(result, expected)
59745970

59755971
# cummax
59765972
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
59835979
df.loc[[2, 6], 'B'] = max_val
59845980
expected.loc[[2, 3, 6, 7], 'B'] = max_val
59855981
result = df.groupby('A').cummax()
5986-
5987-
# TODO: GH 15019
5988-
# overwriting NaNs
5989-
# tm.assert_frame_equal(result, expected)
5990-
5982+
tm.assert_frame_equal(result, expected)
59915983
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5992-
# tm.assert_frame_equal(result, expected)
5984+
tm.assert_frame_equal(result, expected)
59935985

59945986
# Test nan in some values
59955987
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan

0 commit comments

Comments
 (0)