Skip to content

Commit b390fdb

Browse files
committed
BUG: cummin/cummax nans inf values for int64
migrate accum into cython move datetime check into cython mask int64 inf values before entering cython functions Edit whatsnew and alter condition add missing methods edit space
1 parent be3f2ae commit b390fdb

File tree

4 files changed

+47
-41
lines changed

4 files changed

+47
-41
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ Performance Improvements
365365
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
366366
- Improved performance of timeseries plotting with an irregular DatetimeIndex
367367
(or with ``compat_x=True``) (:issue:`15073`).
368-
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
368+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048` & :issue:`15109`)
369369

370370
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
371371

pandas/core/groupby.py

+21-13
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
is_bool_dtype,
2626
is_scalar,
2727
is_list_like,
28+
needs_i8_conversion,
2829
_ensure_float64,
2930
_ensure_platform_int,
3031
_ensure_int64,
@@ -1874,15 +1875,21 @@ def _cython_operation(self, kind, values, how, axis):
18741875
"supported for the 'how' argument")
18751876
out_shape = (self.ngroups,) + values.shape[1:]
18761877

1878+
is_datetimelike = needs_i8_conversion(values.dtype)
18771879
is_numeric = is_numeric_dtype(values.dtype)
18781880

1879-
if is_datetime_or_timedelta_dtype(values.dtype):
1881+
if is_datetimelike:
18801882
values = values.view('int64')
18811883
is_numeric = True
18821884
elif is_bool_dtype(values.dtype):
18831885
values = _ensure_float64(values)
18841886
elif is_integer_dtype(values):
1885-
values = values.astype('int64', copy=False)
1887+
# we use iNaT for the missing value on ints
1888+
# so pre-convert to guard this condition
1889+
if (values == tslib.iNaT).any():
1890+
values = _ensure_float64(values)
1891+
else:
1892+
values = values.astype('int64', copy=False)
18861893
elif is_numeric and not is_complex_dtype(values):
18871894
values = _ensure_float64(values)
18881895
else:
@@ -1911,20 +1918,20 @@ def _cython_operation(self, kind, values, how, axis):
19111918
fill_value=np.nan)
19121919
counts = np.zeros(self.ngroups, dtype=np.int64)
19131920
result = self._aggregate(
1914-
result, counts, values, labels, func, is_numeric)
1921+
result, counts, values, labels, func, is_numeric,
1922+
is_datetimelike)
19151923
elif kind == 'transform':
19161924
result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
19171925
fill_value=np.nan)
19181926

1919-
# temporary storage for running-total type transforms
1920-
accum = np.empty(out_shape, dtype=out_dtype)
19211927
result = self._transform(
1922-
result, accum, values, labels, func, is_numeric)
1928+
result, values, labels, func, is_numeric, is_datetimelike)
19231929

19241930
if is_integer_dtype(result):
1925-
if len(result[result == tslib.iNaT]) > 0:
1931+
mask = result == tslib.iNaT
1932+
if mask.any():
19261933
result = result.astype('float64')
1927-
result[result == tslib.iNaT] = np.nan
1934+
result[mask] = np.nan
19281935

19291936
if kind == 'aggregate' and \
19301937
self._filter_empty_groups and not counts.all():
@@ -1960,7 +1967,7 @@ def transform(self, values, how, axis=0):
19601967
return self._cython_operation('transform', values, how, axis)
19611968

19621969
def _aggregate(self, result, counts, values, comp_ids, agg_func,
1963-
is_numeric):
1970+
is_numeric, is_datetimelike):
19641971
if values.ndim > 3:
19651972
# punting for now
19661973
raise NotImplementedError("number of dimensions is currently "
@@ -1975,8 +1982,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
19751982

19761983
return result
19771984

1978-
def _transform(self, result, accum, values, comp_ids, transform_func,
1979-
is_numeric):
1985+
def _transform(self, result, values, comp_ids, transform_func,
1986+
is_numeric, is_datetimelike):
1987+
19801988
comp_ids, _, ngroups = self.group_info
19811989
if values.ndim > 3:
19821990
# punting for now
@@ -1987,9 +1995,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
19871995

19881996
chunk = chunk.squeeze()
19891997
transform_func(result[:, :, i], values,
1990-
comp_ids, accum)
1998+
comp_ids, is_datetimelike)
19911999
else:
1992-
transform_func(result, values, comp_ids, accum)
2000+
transform_func(result, values, comp_ids, is_datetimelike)
19932001

19942002
return result
19952003

pandas/src/algos_groupby_helper.pxi.in

+14-8
Original file line numberDiff line numberDiff line change
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
574574
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
575575
ndarray[{{dest_type2}}, ndim=2] values,
576576
ndarray[int64_t] labels,
577-
ndarray[{{dest_type2}}, ndim=2] accum):
577+
bint is_datetimelike):
578578
"""
579579
Only transforms on axis=0
580580
"""
581581
cdef:
582582
Py_ssize_t i, j, N, K, size
583583
{{dest_type2}} val, min_val = 0
584+
ndarray[{{dest_type2}}, ndim=2] accum
584585
int64_t lab
585586

586587
N, K = (<object> values).shape
588+
accum = np.empty_like(values)
587589
accum.fill({{inf_val}})
588590

589591
with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
600602
accum[lab, j] = min_val
601603
out[i, j] = accum[lab, j]
602604
# val = nan
603-
else:
605+
elif is_datetimelike:
604606
out[i, j] = {{nan_val}}
605607

606608

@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
609611
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
610612
ndarray[{{dest_type2}}, ndim=2] values,
611613
ndarray[int64_t] labels,
612-
ndarray[{{dest_type2}}, ndim=2] accum):
614+
bint is_datetimelike):
613615
"""
614616
Only transforms on axis=0
615617
"""
616618
cdef:
617619
Py_ssize_t i, j, N, K, size
618620
{{dest_type2}} val, max_val = 0
621+
ndarray[{{dest_type2}}, ndim=2] accum
619622
int64_t lab
620623

621624
N, K = (<object> values).shape
625+
accum = np.empty_like(values)
622626
accum.fill(-{{inf_val}})
623627

624628
with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
635639
accum[lab, j] = max_val
636640
out[i, j] = accum[lab, j]
637641
# val = nan
638-
else:
642+
elif is_datetimelike:
639643
out[i, j] = {{nan_val}}
640644

641645
{{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
682686
def group_cumprod_float64(float64_t[:, :] out,
683687
float64_t[:, :] values,
684688
int64_t[:] labels,
685-
float64_t[:, :] accum):
689+
bint is_datetimelike):
686690
"""
687691
Only transforms on axis=0
688692
"""
689693
cdef:
690694
Py_ssize_t i, j, N, K, size
691695
float64_t val
696+
float64_t[:, :] accum
692697
int64_t lab
693698

694699
N, K = (<object> values).shape
695-
accum = np.ones_like(accum)
700+
accum = np.ones_like(values)
696701

697702
with nogil:
698703
for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
712717
def group_cumsum(numeric[:, :] out,
713718
numeric[:, :] values,
714719
int64_t[:] labels,
715-
numeric[:, :] accum):
720+
is_datetimelike):
716721
"""
717722
Only transforms on axis=0
718723
"""
719724
cdef:
720725
Py_ssize_t i, j, N, K, size
721726
numeric val
727+
numeric[:, :] accum
722728
int64_t lab
723729

724730
N, K = (<object> values).shape
725-
accum = np.zeros_like(accum)
731+
accum = np.zeros_like(values)
726732

727733
with nogil:
728734
for i in range(N):

pandas/tests/groupby/test_groupby.py

+11-19
Original file line numberDiff line numberDiff line change
@@ -5504,39 +5504,38 @@ def test_cython_group_transform_algos(self):
55045504
ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
55055505
(pd.algos.group_cumsum, np.cumsum, dtypes)]
55065506

5507+
is_datetimelike = False
55075508
for pd_op, np_op, dtypes in ops:
55085509
for dtype in dtypes:
55095510
data = np.array([[1], [2], [3], [4]], dtype=dtype)
55105511
ans = np.zeros_like(data)
5511-
accum = np.array([[0]], dtype=dtype)
55125512
labels = np.array([0, 0, 0, 0], dtype=np.int64)
5513-
pd_op(ans, data, labels, accum)
5513+
pd_op(ans, data, labels, is_datetimelike)
55145514
self.assert_numpy_array_equal(np_op(data), ans[:, 0],
55155515
check_dtype=False)
55165516

55175517
# with nans
55185518
labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
55195519

55205520
data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
5521-
accum = np.array([[0.0]])
55225521
actual = np.zeros_like(data)
55235522
actual.fill(np.nan)
5524-
pd.algos.group_cumprod_float64(actual, data, labels, accum)
5523+
pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
55255524
expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
55265525
self.assert_numpy_array_equal(actual[:, 0], expected)
55275526

5528-
accum = np.array([[0.0]])
55295527
actual = np.zeros_like(data)
55305528
actual.fill(np.nan)
5531-
pd.algos.group_cumsum(actual, data, labels, accum)
5529+
pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
55325530
expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
55335531
self.assert_numpy_array_equal(actual[:, 0], expected)
55345532

55355533
# timedelta
5534+
is_datetimelike = True
55365535
data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
5537-
accum = np.array([[0]], dtype='int64')
55385536
actual = np.zeros_like(data, dtype='int64')
5539-
pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
5537+
pd.algos.group_cumsum(actual, data.view('int64'), labels,
5538+
is_datetimelike)
55405539
expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
55415540
2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
55425541
np.timedelta64(5, 'ns')])
@@ -5962,12 +5961,9 @@ def test_cummin_cummax(self):
59625961
df.loc[[2, 6], 'B'] = min_val
59635962
expected.loc[[2, 3, 6, 7], 'B'] = min_val
59645963
result = df.groupby('A').cummin()
5965-
5966-
# TODO: GH 15019
5967-
# overwriting NaNs
5968-
# tm.assert_frame_equal(result, expected)
5964+
tm.assert_frame_equal(result, expected)
59695965
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5970-
# tm.assert_frame_equal(result, expected)
5966+
tm.assert_frame_equal(result, expected)
59715967

59725968
# cummax
59735969
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5980,13 +5976,9 @@ def test_cummin_cummax(self):
59805976
df.loc[[2, 6], 'B'] = max_val
59815977
expected.loc[[2, 3, 6, 7], 'B'] = max_val
59825978
result = df.groupby('A').cummax()
5983-
5984-
# TODO: GH 15019
5985-
# overwriting NaNs
5986-
# tm.assert_frame_equal(result, expected)
5987-
5979+
tm.assert_frame_equal(result, expected)
59885980
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5989-
# tm.assert_frame_equal(result, expected)
5981+
tm.assert_frame_equal(result, expected)
59905982

59915983
# Test nan in some values
59925984
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan

0 commit comments

Comments
 (0)