     is_categorical_dtype,
     is_datetimelike,
     is_datetime_or_timedelta_dtype,
+    is_datetime64_any_dtype,
     is_bool, is_integer_dtype,
     is_complex_dtype,
     is_bool_dtype,
@@ -109,10 +110,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
     @Substitution(name='groupby', f=name)
     @Appender(_doc_template)
     @Appender(_local_template)
-    def f(self):
+    def f(self, **kwargs):
+        if 'numeric_only' not in kwargs:
+            kwargs['numeric_only'] = numeric_only
         self._set_group_selection()
         try:
-            return self._cython_agg_general(alias, numeric_only=numeric_only)
+            return self._cython_agg_general(alias, alt=npfunc, **kwargs)
         except AssertionError as e:
             raise SpecificationError(str(e))
         except Exception:
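As a rough usage sketch (not part of the patch; the frame and column names are invented), the reductions generated by _groupby_function now accept a numeric_only keyword and forward it, together with alt=npfunc as a fallback, into _cython_agg_general:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'ts': pd.to_datetime(['2017-01-01',
                                         '2017-01-02',
                                         '2017-01-03'])})

# numeric_only=True (the default for these reductions) keeps only 'val';
# numeric_only=False also aggregates the datetime column, falling back to
# npfunc when the cython path declines the block.
print(df.groupby('key').min(numeric_only=False))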
@@ -127,7 +130,9 @@ def f(self):


 def _first_compat(x, axis=0):
+
     def _first(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -142,6 +147,7 @@ def _first(x):

 def _last_compat(x, axis=0):
     def _last(x):
+
         x = np.asarray(x)
         x = x[notnull(x)]
         if len(x) == 0:
@@ -775,14 +781,16 @@ def _try_cast(self, result, obj):
         return result

     def _cython_transform(self, how, numeric_only=True):
-        output = {}
+        output = collections.OrderedDict()
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
             if numeric_only and not is_numeric:
                 continue

             try:
                 result, names = self.grouper.transform(obj.values, how)
+            except NotImplementedError:
+                continue
             except AssertionError as e:
                 raise GroupByError(str(e))
             output[name] = self._try_cast(result, obj)
@@ -792,7 +800,7 @@ def _cython_transform(self, how, numeric_only=True):

         return self._wrap_transformed_output(output, names)

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         output = {}
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -1015,26 +1023,26 @@ def mean(self, *args, **kwargs):

         For multiple groupings, the result index will be a MultiIndex
         """
-        nv.validate_groupby_func('mean', args, kwargs)
+        nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
         try:
-            return self._cython_agg_general('mean')
+            return self._cython_agg_general('mean', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
             self._set_group_selection()
-            f = lambda x: x.mean(axis=self.axis)
+            f = lambda x: x.mean(axis=self.axis, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def median(self):
+    def median(self, **kwargs):
         """
         Compute median of groups, excluding missing values

         For multiple groupings, the result index will be a MultiIndex
         """
         try:
-            return self._cython_agg_general('median')
+            return self._cython_agg_general('median', **kwargs)
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
@@ -1044,7 +1052,7 @@ def median(self):
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
-                return x.median(axis=self.axis)
+                return x.median(axis=self.axis, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
@@ -1063,7 +1071,7 @@ def std(self, ddof=1, *args, **kwargs):

         # TODO: implement at Cython level?
         nv.validate_groupby_func('std', args, kwargs)
-        return np.sqrt(self.var(ddof=ddof))
+        return np.sqrt(self.var(ddof=ddof, **kwargs))

     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1080,10 +1088,10 @@ def var(self, ddof=1, *args, **kwargs):
         """
         nv.validate_groupby_func('var', args, kwargs)
         if ddof == 1:
-            return self._cython_agg_general('var')
+            return self._cython_agg_general('var', **kwargs)
         else:
             self._set_group_selection()
-            f = lambda x: x.var(ddof=ddof)
+            f = lambda x: x.var(ddof=ddof, **kwargs)
             return self._python_agg_general(f)

     @Substitution(name='groupby')
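A similarly hedged sketch (invented data) of the whitelisting change above: 'numeric_only' is now accepted by nv.validate_groupby_func for mean, and mean/median/var/std forward any extra keywords into the cython aggregation or the python fallback:

import pandas as pd

df = pd.DataFrame({'key': ['x', 'x', 'y'],
                   'val': [1.0, 3.0, 5.0]})

# both spellings should reach the same cython aggregation
print(df.groupby('key').mean())
print(df.groupby('key').mean(numeric_only=True))
print(df.groupby('key').median(numeric_only=True))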
@@ -1400,39 +1408,39 @@ def cumcount(self, ascending=True):
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
         """Cumulative product for each group"""
-        nv.validate_groupby_func('cumprod', args, kwargs)
+        nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumprod(axis=axis))
+            return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))

-        return self._cython_transform('cumprod')
+        return self._cython_transform('cumprod', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumsum(self, axis=0, *args, **kwargs):
         """Cumulative sum for each group"""
-        nv.validate_groupby_func('cumsum', args, kwargs)
+        nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
         if axis != 0:
-            return self.apply(lambda x: x.cumsum(axis=axis))
+            return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))

-        return self._cython_transform('cumsum')
+        return self._cython_transform('cumsum', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummin(self, axis=0):
+    def cummin(self, axis=0, **kwargs):
         """Cumulative min for each group"""
         if axis != 0:
             return self.apply(lambda x: np.minimum.accumulate(x, axis))

-        return self._cython_transform('cummin')
+        return self._cython_transform('cummin', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def cummax(self, axis=0):
+    def cummax(self, axis=0, **kwargs):
         """Cumulative max for each group"""
         if axis != 0:
             return self.apply(lambda x: np.maximum.accumulate(x, axis))

-        return self._cython_transform('cummax')
+        return self._cython_transform('cummax', **kwargs)

     @Substitution(name='groupby')
     @Appender(_doc_template)
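The cumulative transforms follow the same pattern; a minimal sketch (invented data), assuming the keyword simply flows through to _cython_transform:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'val': [1, 2, 3, 4]})

# cumsum/cumprod whitelist 'numeric_only'; cummin/cummax now accept and
# forward extra keyword arguments as well
print(df.groupby('key').cumsum())
print(df.groupby('key')['val'].cummin())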
@@ -1828,6 +1836,28 @@ def wrapper(*args, **kwargs):
     def _cython_operation(self, kind, values, how, axis):
         assert kind in ['transform', 'aggregate']

+        # can we do this operation with our cython functions
+        # if not raise NotImplementedError
+
+        # we raise NotImplemented if this is an invalid operation
+        # entirely, e.g. adding datetimes
+
+        # categoricals are only 1d, so we
+        # are not setup for dim transforming
+        if is_categorical_dtype(values):
+            raise NotImplementedError(
+                "categoricals are not support in cython ops ATM")
+        elif is_datetime64_any_dtype(values):
+            if how in ['add', 'prod', 'cumsum', 'cumprod']:
+                raise NotImplementedError(
+                    "datetime64 type does not support {} "
+                    "operations".format(how))
+        elif is_timedelta64_dtype(values):
+            if how in ['prod', 'cumprod']:
+                raise NotImplementedError(
+                    "timedelta64 type does not support {} "
+                    "operations".format(how))
+
         arity = self._cython_arity.get(how, 1)

         vdim = values.ndim
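To make the guard concrete, a hypothetical call (column names invented) that now exercises the NotImplementedError path: cumsum on datetime64 values is rejected at the cython layer, and callers such as _cython_transform catch the exception and skip the offending block instead of erroring:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'ts': pd.to_datetime(['2017-01-01',
                                         '2017-01-02',
                                         '2017-01-03'])})

# the 'ts' block raises NotImplementedError inside _cython_operation and is
# skipped by _cython_transform, so only 'val' is accumulated
print(df.groupby('key').cumsum(numeric_only=False))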
@@ -3155,9 +3185,9 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True):
         new_items, new_blocks = self._cython_agg_blocks(
-            how, numeric_only=numeric_only)
+            how, alt=alt, numeric_only=numeric_only)
         return self._wrap_agged_blocks(new_items, new_blocks)

     def _wrap_agged_blocks(self, items, blocks):
@@ -3183,29 +3213,75 @@ def _wrap_agged_blocks(self, items, blocks):

     _block_agg_axis = 0

-    def _cython_agg_blocks(self, how, numeric_only=True):
-        data, agg_axis = self._get_data_to_aggregate()
+    def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
+        # TODO: the actual managing of mgr_locs is a PITA
+        # here, it should happen via BlockManager.combine

-        new_blocks = []
+        data, agg_axis = self._get_data_to_aggregate()

         if numeric_only:
             data = data.get_numeric_data(copy=False)

+        new_blocks = []
+        new_items = []
+        deleted_items = []
         for block in data.blocks:

-            result, _ = self.grouper.aggregate(
-                block.values, how, axis=agg_axis)
+            locs = block.mgr_locs.as_array
+            try:
+                result, _ = self.grouper.aggregate(
+                    block.values, how, axis=agg_axis)
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+
+                if alt is None:
+                    # we cannot perform the operation
+                    # in an alternate way, exclude the block
+                    deleted_items.append(locs)
+                    continue
+
+                # call our grouper again with only this block
+                obj = self.obj[data.items[locs]]
+                s = groupby(obj, self.grouper)
+                result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                result = result._data.blocks[0]

             # see if we can cast the block back to the original dtype
             result = block._try_coerce_and_cast_result(result)

-            newb = make_block(result, placement=block.mgr_locs)
+            new_items.append(locs)
+            newb = block.make_block_same_class(result)
             new_blocks.append(newb)

         if len(new_blocks) == 0:
             raise DataError('No numeric types to aggregate')

-        return data.items, new_blocks
+        # reset the locs in the blocks to correspond to our
+        # current ordering
+        indexer = np.concatenate(new_items)
+        new_items = data.items.take(np.sort(indexer))
+
+        if len(deleted_items):
+
+            # we need to adjust the indexer to account for the
+            # items we have removed
+            # really should be done in internals :<
+
+            deleted = np.concatenate(deleted_items)
+            ai = np.arange(len(data))
+            mask = np.zeros(len(data))
+            mask[deleted] = 1
+            indexer = (ai - mask.cumsum())[indexer]
+
+        offset = 0
+        for b in new_blocks:
+            l = len(b.mgr_locs)
+            b.mgr_locs = indexer[offset:(offset + l)]
+            offset += l
+
+        return new_items, new_blocks

     def _get_data_to_aggregate(self):
         obj = self._obj_with_exclusions
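Finally, an end-to-end sketch (invented data, expected behaviour hedged) of what the alt fallback in _cython_agg_blocks buys: blocks the cython aggregation cannot handle are re-aggregated in python with the alternate npfunc rather than dropped, and the mgr_locs bookkeeping above keeps the surviving columns aligned when a block does have to be excluded:

import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1, 2, 3],
                   'txt': ['x', 'y', 'z']})

# 'add' is not implemented in cython for object blocks, so with
# numeric_only=False the 'txt' block should fall back to the alt function
# (np.sum, i.e. per-group string concatenation) while 'val' keeps the fast
# cython path
print(df.groupby('key').sum(numeric_only=False))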