19
19
is_categorical_dtype ,
20
20
is_datetimelike ,
21
21
is_datetime_or_timedelta_dtype ,
22
+ is_datetime64_any_dtype ,
22
23
is_bool , is_integer_dtype ,
23
24
is_complex_dtype ,
24
25
is_bool_dtype ,
@@ -108,10 +109,12 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
108
109
@Substitution (name = 'groupby' , f = name )
109
110
@Appender (_doc_template )
110
111
@Appender (_local_template )
111
- def f (self ):
112
+ def f (self , ** kwargs ):
113
+ if 'numeric_only' not in kwargs :
114
+ kwargs ['numeric_only' ] = numeric_only
112
115
self ._set_group_selection ()
113
116
try :
114
- return self ._cython_agg_general (alias , numeric_only = numeric_only )
117
+ return self ._cython_agg_general (alias , alt = npfunc , ** kwargs )
115
118
except AssertionError as e :
116
119
raise SpecificationError (str (e ))
117
120
except Exception :
@@ -126,7 +129,9 @@ def f(self):
126
129
127
130
128
131
def _first_compat (x , axis = 0 ):
132
+
129
133
def _first (x ):
134
+
130
135
x = np .asarray (x )
131
136
x = x [notnull (x )]
132
137
if len (x ) == 0 :
@@ -141,6 +146,7 @@ def _first(x):
141
146
142
147
def _last_compat (x , axis = 0 ):
143
148
def _last (x ):
149
+
144
150
x = np .asarray (x )
145
151
x = x [notnull (x )]
146
152
if len (x ) == 0 :
@@ -774,14 +780,16 @@ def _try_cast(self, result, obj):
774
780
return result
775
781
776
782
def _cython_transform (self , how , numeric_only = True ):
777
- output = {}
783
+ output = collections . OrderedDict ()
778
784
for name , obj in self ._iterate_slices ():
779
785
is_numeric = is_numeric_dtype (obj .dtype )
780
786
if numeric_only and not is_numeric :
781
787
continue
782
788
783
789
try :
784
790
result , names = self .grouper .transform (obj .values , how )
791
+ except NotImplementedError :
792
+ continue
785
793
except AssertionError as e :
786
794
raise GroupByError (str (e ))
787
795
output [name ] = self ._try_cast (result , obj )
@@ -791,7 +799,7 @@ def _cython_transform(self, how, numeric_only=True):
791
799
792
800
return self ._wrap_transformed_output (output , names )
793
801
794
- def _cython_agg_general (self , how , numeric_only = True ):
802
+ def _cython_agg_general (self , how , alt = None , numeric_only = True ):
795
803
output = {}
796
804
for name , obj in self ._iterate_slices ():
797
805
is_numeric = is_numeric_dtype (obj .dtype )
@@ -1014,26 +1022,26 @@ def mean(self, *args, **kwargs):
1014
1022
1015
1023
For multiple groupings, the result index will be a MultiIndex
1016
1024
"""
1017
- nv .validate_groupby_func ('mean' , args , kwargs )
1025
+ nv .validate_groupby_func ('mean' , args , kwargs , [ 'numeric_only' ] )
1018
1026
try :
1019
- return self ._cython_agg_general ('mean' )
1027
+ return self ._cython_agg_general ('mean' , ** kwargs )
1020
1028
except GroupByError :
1021
1029
raise
1022
1030
except Exception : # pragma: no cover
1023
1031
self ._set_group_selection ()
1024
- f = lambda x : x .mean (axis = self .axis )
1032
+ f = lambda x : x .mean (axis = self .axis , ** kwargs )
1025
1033
return self ._python_agg_general (f )
1026
1034
1027
1035
@Substitution (name = 'groupby' )
1028
1036
@Appender (_doc_template )
1029
- def median (self ):
1037
+ def median (self , ** kwargs ):
1030
1038
"""
1031
1039
Compute median of groups, excluding missing values
1032
1040
1033
1041
For multiple groupings, the result index will be a MultiIndex
1034
1042
"""
1035
1043
try :
1036
- return self ._cython_agg_general ('median' )
1044
+ return self ._cython_agg_general ('median' , ** kwargs )
1037
1045
except GroupByError :
1038
1046
raise
1039
1047
except Exception : # pragma: no cover
@@ -1043,7 +1051,7 @@ def median(self):
1043
1051
def f (x ):
1044
1052
if isinstance (x , np .ndarray ):
1045
1053
x = Series (x )
1046
- return x .median (axis = self .axis )
1054
+ return x .median (axis = self .axis , ** kwargs )
1047
1055
return self ._python_agg_general (f )
1048
1056
1049
1057
@Substitution (name = 'groupby' )
@@ -1062,7 +1070,7 @@ def std(self, ddof=1, *args, **kwargs):
1062
1070
1063
1071
# TODO: implement at Cython level?
1064
1072
nv .validate_groupby_func ('std' , args , kwargs )
1065
- return np .sqrt (self .var (ddof = ddof ))
1073
+ return np .sqrt (self .var (ddof = ddof , ** kwargs ))
1066
1074
1067
1075
@Substitution (name = 'groupby' )
1068
1076
@Appender (_doc_template )
@@ -1079,10 +1087,10 @@ def var(self, ddof=1, *args, **kwargs):
1079
1087
"""
1080
1088
nv .validate_groupby_func ('var' , args , kwargs )
1081
1089
if ddof == 1 :
1082
- return self ._cython_agg_general ('var' )
1090
+ return self ._cython_agg_general ('var' , ** kwargs )
1083
1091
else :
1084
1092
self ._set_group_selection ()
1085
- f = lambda x : x .var (ddof = ddof )
1093
+ f = lambda x : x .var (ddof = ddof , ** kwargs )
1086
1094
return self ._python_agg_general (f )
1087
1095
1088
1096
@Substitution (name = 'groupby' )
@@ -1399,21 +1407,21 @@ def cumcount(self, ascending=True):
1399
1407
@Appender (_doc_template )
1400
1408
def cumprod (self , axis = 0 , * args , ** kwargs ):
1401
1409
"""Cumulative product for each group"""
1402
- nv .validate_groupby_func ('cumprod' , args , kwargs )
1410
+ nv .validate_groupby_func ('cumprod' , args , kwargs , [ 'numeric_only' ] )
1403
1411
if axis != 0 :
1404
- return self .apply (lambda x : x .cumprod (axis = axis ))
1412
+ return self .apply (lambda x : x .cumprod (axis = axis , ** kwargs ))
1405
1413
1406
- return self ._cython_transform ('cumprod' )
1414
+ return self ._cython_transform ('cumprod' , ** kwargs )
1407
1415
1408
1416
@Substitution (name = 'groupby' )
1409
1417
@Appender (_doc_template )
1410
1418
def cumsum (self , axis = 0 , * args , ** kwargs ):
1411
1419
"""Cumulative sum for each group"""
1412
- nv .validate_groupby_func ('cumsum' , args , kwargs )
1420
+ nv .validate_groupby_func ('cumsum' , args , kwargs , [ 'numeric_only' ] )
1413
1421
if axis != 0 :
1414
- return self .apply (lambda x : x .cumsum (axis = axis ))
1422
+ return self .apply (lambda x : x .cumsum (axis = axis , ** kwargs ))
1415
1423
1416
- return self ._cython_transform ('cumsum' )
1424
+ return self ._cython_transform ('cumsum' , ** kwargs )
1417
1425
1418
1426
@Substitution (name = 'groupby' )
1419
1427
@Appender (_doc_template )
@@ -1807,6 +1815,28 @@ def wrapper(*args, **kwargs):
1807
1815
def _cython_operation (self , kind , values , how , axis ):
1808
1816
assert kind in ['transform' , 'aggregate' ]
1809
1817
1818
+ # can we do this operation with our cython functions
1819
+ # if not, raise NotImplementedError
1820
+
1821
+ # we raise NotImplementedError if this is an invalid operation
1822
+ # entirely, e.g. adding datetimes
1823
+
1824
+ # categoricals are only 1d, so we
1825
+ # are not set up for dim transforming
1826
+ if is_categorical_dtype (values ):
1827
+ raise NotImplementedError (
1828
+ "categoricals are not support in cython ops ATM" )
1829
+ elif is_datetime64_any_dtype (values ):
1830
+ if how in ['add' , 'prod' , 'cumsum' , 'cumprod' ]:
1831
+ raise NotImplementedError (
1832
+ "datetime64 type does not support {} "
1833
+ "operations" .format (how ))
1834
+ elif is_timedelta64_dtype (values ):
1835
+ if how in ['prod' , 'cumprod' ]:
1836
+ raise NotImplementedError (
1837
+ "timedelta64 type does not support {} "
1838
+ "operations" .format (how ))
1839
+
1810
1840
arity = self ._cython_arity .get (how , 1 )
1811
1841
1812
1842
vdim = values .ndim
@@ -3134,9 +3164,9 @@ def _iterate_slices(self):
3134
3164
continue
3135
3165
yield val , slicer (val )
3136
3166
3137
- def _cython_agg_general (self , how , numeric_only = True ):
3167
+ def _cython_agg_general (self , how , alt = None , numeric_only = True ):
3138
3168
new_items , new_blocks = self ._cython_agg_blocks (
3139
- how , numeric_only = numeric_only )
3169
+ how , alt = alt , numeric_only = numeric_only )
3140
3170
return self ._wrap_agged_blocks (new_items , new_blocks )
3141
3171
3142
3172
def _wrap_agged_blocks (self , items , blocks ):
@@ -3162,29 +3192,75 @@ def _wrap_agged_blocks(self, items, blocks):
3162
3192
3163
3193
_block_agg_axis = 0
3164
3194
3165
- def _cython_agg_blocks (self , how , numeric_only = True ):
3166
- data , agg_axis = self ._get_data_to_aggregate ()
3195
+ def _cython_agg_blocks (self , how , alt = None , numeric_only = True ):
3196
+ # TODO: the actual managing of mgr_locs is a PITA
3197
+ # here, it should happen via BlockManager.combine
3167
3198
3168
- new_blocks = []
3199
+ data , agg_axis = self . _get_data_to_aggregate ()
3169
3200
3170
3201
if numeric_only :
3171
3202
data = data .get_numeric_data (copy = False )
3172
3203
3204
+ new_blocks = []
3205
+ new_items = []
3206
+ deleted_items = []
3173
3207
for block in data .blocks :
3174
3208
3175
- result , _ = self .grouper .aggregate (
3176
- block .values , how , axis = agg_axis )
3209
+ locs = block .mgr_locs .as_array
3210
+ try :
3211
+ result , _ = self .grouper .aggregate (
3212
+ block .values , how , axis = agg_axis )
3213
+ except NotImplementedError :
3214
+ # generally if we have numeric_only=False
3215
+ # and non-applicable functions
3216
+ # try to python agg
3217
+
3218
+ if alt is None :
3219
+ # we cannot perform the operation
3220
+ # in an alternate way, exclude the block
3221
+ deleted_items .append (locs )
3222
+ continue
3223
+
3224
+ # call our grouper again with only this block
3225
+ obj = self .obj [data .items [locs ]]
3226
+ s = groupby (obj , self .grouper )
3227
+ result = s .aggregate (lambda x : alt (x , axis = self .axis ))
3228
+ result = result ._data .blocks [0 ]
3177
3229
3178
3230
# see if we can cast the block back to the original dtype
3179
3231
result = block ._try_coerce_and_cast_result (result )
3180
3232
3181
- newb = make_block (result , placement = block .mgr_locs )
3233
+ new_items .append (locs )
3234
+ newb = block .make_block_same_class (result )
3182
3235
new_blocks .append (newb )
3183
3236
3184
3237
if len (new_blocks ) == 0 :
3185
3238
raise DataError ('No numeric types to aggregate' )
3186
3239
3187
- return data .items , new_blocks
3240
+ # reset the locs in the blocks to correspond to our
3241
+ # current ordering
3242
+ indexer = np .concatenate (new_items )
3243
+ new_items = data .items .take (np .sort (indexer ))
3244
+
3245
+ if len (deleted_items ):
3246
+
3247
+ # we need to adjust the indexer to account for the
3248
+ # items we have removed
3249
+ # really should be done in internals :<
3250
+
3251
+ deleted = np .concatenate (deleted_items )
3252
+ ai = np .arange (len (data ))
3253
+ mask = np .zeros (len (data ))
3254
+ mask [deleted ] = 1
3255
+ indexer = (ai - mask .cumsum ())[indexer ]
3256
+
3257
+ offset = 0
3258
+ for b in new_blocks :
3259
+ l = len (b .mgr_locs )
3260
+ b .mgr_locs = indexer [offset :(offset + l )]
3261
+ offset += l
3262
+
3263
+ return new_items , new_blocks
3188
3264
3189
3265
def _get_data_to_aggregate (self ):
3190
3266
obj = self ._obj_with_exclusions
0 commit comments