@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
62
62
INTERPOLATION_MIDPOINT
63
63
64
64
65
- cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
65
+ cdef float64_t median_linear_mask(
66
+ float64_t* a,
67
+ int n,
68
+ uint8_t* mask,
69
+ bint skipna = True
70
+ ) noexcept nogil:
66
71
cdef:
67
72
int i, j, na_count = 0
68
73
float64_t* tmp
@@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
77
82
na_count += 1
78
83
79
84
if na_count:
80
- if na_count == n:
85
+ if na_count == n or not skipna :
81
86
return NaN
82
87
83
88
tmp = < float64_t* > malloc((n - na_count) * sizeof(float64_t))
@@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
104
109
cdef float64_t median_linear(
105
110
float64_t* a,
106
111
int n,
107
- bint is_datetimelike = False
112
+ bint is_datetimelike = False ,
113
+ bint skipna = True ,
108
114
) noexcept nogil:
109
115
cdef:
110
116
int i, j, na_count = 0
@@ -125,7 +131,7 @@ cdef float64_t median_linear(
125
131
na_count += 1
126
132
127
133
if na_count:
128
- if na_count == n:
134
+ if na_count == n or not skipna :
129
135
return NaN
130
136
131
137
tmp = < float64_t* > malloc((n - na_count) * sizeof(float64_t))
@@ -186,6 +192,7 @@ def group_median_float64(
186
192
const uint8_t[:, :] mask = None ,
187
193
uint8_t[:, ::1] result_mask = None ,
188
194
bint is_datetimelike = False ,
195
+ bint skipna = True ,
189
196
) -> None:
190
197
"""
191
198
Only aggregates on axis = 0
@@ -229,7 +236,7 @@ def group_median_float64(
229
236
230
237
for j in range (ngroups):
231
238
size = _counts[j + 1 ]
232
- result = median_linear_mask(ptr, size, ptr_mask)
239
+ result = median_linear_mask(ptr, size, ptr_mask, skipna )
233
240
out[j, i] = result
234
241
235
242
if result != result:
@@ -244,7 +251,7 @@ def group_median_float64(
244
251
ptr += _counts[0 ]
245
252
for j in range (ngroups):
246
253
size = _counts[j + 1 ]
247
- out[j, i] = median_linear(ptr, size, is_datetimelike)
254
+ out[j, i] = median_linear(ptr, size, is_datetimelike, skipna )
248
255
ptr += size
249
256
250
257
@@ -804,17 +811,18 @@ def group_prod(
804
811
const uint8_t[:, ::1] mask ,
805
812
uint8_t[:, ::1] result_mask = None ,
806
813
Py_ssize_t min_count = 0 ,
814
+ bint skipna = True ,
807
815
) -> None:
808
816
"""
809
817
Only aggregates on axis = 0
810
818
"""
811
819
cdef:
812
820
Py_ssize_t i , j , N , K , lab , ncounts = len (counts)
813
- int64float_t val
821
+ int64float_t val , nan_val
814
822
int64float_t[:, ::1] prodx
815
823
int64_t[:, ::1] nobs
816
824
Py_ssize_t len_values = len (values), len_labels = len (labels)
817
- bint isna_entry , uses_mask = mask is not None
825
+ bint isna_entry , isna_result , uses_mask = mask is not None
818
826
819
827
if len_values != len_labels:
820
828
raise ValueError("len(index ) != len(labels )")
@@ -823,6 +831,7 @@ def group_prod(
823
831
prodx = np.ones((< object > out).shape, dtype = (< object > out).base.dtype)
824
832
825
833
N , K = (< object > values).shape
834
+ nan_val = _get_na_val(< int64float_t> 0 , False )
826
835
827
836
with nogil:
828
837
for i in range(N ):
@@ -836,12 +845,23 @@ def group_prod(
836
845
837
846
if uses_mask:
838
847
isna_entry = mask[i, j]
848
+ isna_result = result_mask[lab, j]
839
849
else :
840
850
isna_entry = _treat_as_na(val, False )
851
+ isna_result = _treat_as_na(prodx[lab, j], False )
852
+
853
+ if not skipna and isna_result:
854
+ # If prod is already NA, no need to update it
855
+ continue
841
856
842
857
if not isna_entry:
843
858
nobs[lab, j] += 1
844
859
prodx[lab, j] *= val
860
+ elif not skipna:
861
+ if uses_mask:
862
+ result_mask[lab, j] = True
863
+ else :
864
+ prodx[lab, j] = nan_val
845
865
846
866
_check_below_mincount(
847
867
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
@@ -862,14 +882,15 @@ def group_var(
862
882
uint8_t[:, ::1] result_mask = None ,
863
883
bint is_datetimelike = False ,
864
884
str name = " var" ,
885
+ bint skipna = True ,
865
886
) -> None:
866
887
cdef:
867
888
Py_ssize_t i , j , N , K , lab , ncounts = len (counts)
868
889
floating val , ct , oldmean
869
890
floating[:, ::1] mean
870
891
int64_t[:, ::1] nobs
871
892
Py_ssize_t len_values = len (values), len_labels = len (labels)
872
- bint isna_entry , uses_mask = mask is not None
893
+ bint isna_entry , isna_result , uses_mask = mask is not None
873
894
bint is_std = name == " std"
874
895
bint is_sem = name == " sem"
875
896
@@ -898,19 +919,34 @@ def group_var(
898
919
899
920
if uses_mask:
900
921
isna_entry = mask[i, j]
922
+ isna_result = result_mask[lab, j]
901
923
elif is_datetimelike:
902
924
# With group_var, we cannot just use _treat_as_na bc
903
925
# datetimelike dtypes get cast to float64 instead of
904
926
# to int64.
905
927
isna_entry = val == NPY_NAT
928
+ isna_result = out[lab, j] == NPY_NAT
906
929
else :
907
930
isna_entry = _treat_as_na(val, is_datetimelike)
931
+ isna_result = _treat_as_na(out[lab, j], is_datetimelike)
932
+
933
+ if not skipna and isna_result:
934
+ # If aggregate is already NA, don't add to it. This is important for
935
+ # datetimelike because adding a value to NPY_NAT may not result
936
+ # in a NPY_NAT
937
+ continue
908
938
909
939
if not isna_entry:
910
940
nobs[lab, j] += 1
911
941
oldmean = mean[lab, j]
912
942
mean[lab, j] += (val - oldmean) / nobs[lab, j]
913
943
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
944
+ elif not skipna:
945
+ nobs[lab, j] = 0
946
+ if uses_mask:
947
+ result_mask[lab, j] = True
948
+ else :
949
+ out[lab, j] = NAN
914
950
915
951
for i in range (ncounts):
916
952
for j in range (K):
@@ -1164,7 +1200,7 @@ def group_mean(
1164
1200
mean_t[:, ::1] sumx , compensation
1165
1201
int64_t[:, ::1] nobs
1166
1202
Py_ssize_t len_values = len (values), len_labels = len (labels)
1167
- bint isna_entry , uses_mask = mask is not None
1203
+ bint isna_entry , isna_result , uses_mask = mask is not None
1168
1204
1169
1205
assert min_count == -1, "'min_count' only used in sum and prod"
1170
1206
@@ -1194,25 +1230,24 @@ def group_mean(
1194
1230
for j in range (K):
1195
1231
val = values[i, j]
1196
1232
1197
- if not skipna and (
1198
- (uses_mask and result_mask[lab, j]) or
1199
- (is_datetimelike and sumx[lab, j] == NPY_NAT) or
1200
- _treat_as_na(sumx[lab, j], False )
1201
- ):
1202
- # If sum is already NA, don't add to it. This is important for
1203
- # datetimelike because adding a value to NPY_NAT may not result
1204
- # in NPY_NAT
1205
- continue
1206
-
1207
1233
if uses_mask:
1208
1234
isna_entry = mask[i, j]
1235
+ isna_result = result_mask[lab, j]
1209
1236
elif is_datetimelike:
1210
1237
# With group_mean, we cannot just use _treat_as_na bc
1211
1238
# datetimelike dtypes get cast to float64 instead of
1212
1239
# to int64.
1213
1240
isna_entry = val == NPY_NAT
1241
+ isna_result = sumx[lab, j] == NPY_NAT
1214
1242
else :
1215
1243
isna_entry = _treat_as_na(val, is_datetimelike)
1244
+ isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
1245
+
1246
+ if not skipna and isna_result:
1247
+ # If sum is already NA, don't add to it. This is important for
1248
+ # datetimelike because adding a value to NPY_NAT may not result
1249
+ # in NPY_NAT
1250
+ continue
1216
1251
1217
1252
if not isna_entry:
1218
1253
nobs[lab, j] += 1
@@ -1806,6 +1841,7 @@ cdef group_min_max(
1806
1841
bint compute_max = True ,
1807
1842
const uint8_t[:, ::1 ] mask = None ,
1808
1843
uint8_t[:, ::1 ] result_mask = None ,
1844
+ bint skipna = True ,
1809
1845
):
1810
1846
"""
1811
1847
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1833,6 +1869,8 @@ cdef group_min_max(
1833
1869
result_mask : ndarray[bool, ndim=2], optional
1834
1870
If not None, these specify locations in the output that are NA.
1835
1871
Modified in-place.
1872
+ skipna : bool, default True
1873
+ If True, ignore nans in `values`.
1836
1874
1837
1875
Notes
1838
1876
-----
@@ -1841,17 +1879,18 @@ cdef group_min_max(
1841
1879
"""
1842
1880
cdef:
1843
1881
Py_ssize_t i, j, N, K, lab, ngroups = len (counts)
1844
- numeric_t val
1882
+ numeric_t val, nan_val
1845
1883
numeric_t[:, ::1 ] group_min_or_max
1846
1884
int64_t[:, ::1 ] nobs
1847
1885
bint uses_mask = mask is not None
1848
- bint isna_entry
1886
+ bint isna_entry, isna_result
1849
1887
1850
1888
if not len (values) == len (labels):
1851
1889
raise AssertionError (" len(index) != len(labels)" )
1852
1890
1853
1891
min_count = max (min_count, 1 )
1854
1892
nobs = np.zeros((< object > out).shape, dtype = np.int64)
1893
+ nan_val = _get_na_val(< numeric_t> 0 , is_datetimelike)
1855
1894
1856
1895
group_min_or_max = np.empty_like(out)
1857
1896
group_min_or_max[:] = _get_min_or_max(< numeric_t> 0 , compute_max, is_datetimelike)
@@ -1870,8 +1909,15 @@ cdef group_min_max(
1870
1909
1871
1910
if uses_mask:
1872
1911
isna_entry = mask[i, j]
1912
+ isna_result = result_mask[lab, j]
1873
1913
else :
1874
1914
isna_entry = _treat_as_na(val, is_datetimelike)
1915
+ isna_result = _treat_as_na(group_min_or_max[lab, j],
1916
+ is_datetimelike)
1917
+
1918
+ if not skipna and isna_result:
1919
+ # If current min/max is already NA, it will always be NA
1920
+ continue
1875
1921
1876
1922
if not isna_entry:
1877
1923
nobs[lab, j] += 1
@@ -1881,6 +1927,11 @@ cdef group_min_max(
1881
1927
else :
1882
1928
if val < group_min_or_max[lab, j]:
1883
1929
group_min_or_max[lab, j] = val
1930
+ elif not skipna:
1931
+ if uses_mask:
1932
+ result_mask[lab, j] = True
1933
+ else :
1934
+ group_min_or_max[lab, j] = nan_val
1884
1935
1885
1936
_check_below_mincount(
1886
1937
out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
@@ -2012,6 +2063,7 @@ def group_max(
2012
2063
bint is_datetimelike = False ,
2013
2064
const uint8_t[:, ::1] mask = None ,
2014
2065
uint8_t[:, ::1] result_mask = None ,
2066
+ bint skipna = True ,
2015
2067
) -> None:
2016
2068
"""See group_min_max.__doc__"""
2017
2069
group_min_max(
@@ -2024,6 +2076,7 @@ def group_max(
2024
2076
compute_max = True ,
2025
2077
mask = mask,
2026
2078
result_mask = result_mask,
2079
+ skipna = skipna,
2027
2080
)
2028
2081
2029
2082
@@ -2038,6 +2091,7 @@ def group_min(
2038
2091
bint is_datetimelike = False ,
2039
2092
const uint8_t[:, ::1] mask = None ,
2040
2093
uint8_t[:, ::1] result_mask = None ,
2094
+ bint skipna = True ,
2041
2095
) -> None:
2042
2096
"""See group_min_max.__doc__"""
2043
2097
group_min_max(
@@ -2050,6 +2104,7 @@ def group_min(
2050
2104
compute_max = False ,
2051
2105
mask = mask,
2052
2106
result_mask = result_mask,
2107
+ skipna = skipna,
2053
2108
)
2054
2109
2055
2110
0 commit comments