@@ -770,9 +770,361 @@ def create_hdf_rows_3d(ndarray index, ndarray columns,
 
     return l
 
+# ------------------------------------------------------------------------------
+# Groupby-related functions
+
+@cython.boundscheck(False)
+def arrmap(ndarray[object] index, object func):
+    cdef int length = index.shape[0]
+    cdef int i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    for i from 0 <= i < length:
+        result[i] = func(index[i])
+
+    return result
+
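+# A minimal usage sketch (assuming numpy is imported as np, as elsewhere in
+# this module): apply a Python callable to every element of an object array.
+#
+#   >>> arrmap(np.array(['a', 'bb', 'ccc'], dtype=object), len)
+#   array([1, 2, 3], dtype=object)
+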
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def is_lexsorted(list list_of_arrays):
+    cdef:
+        int i
+        Py_ssize_t n, nlevels
+        int64_t k, cur, pre
+        ndarray arr
+
+    nlevels = len(list_of_arrays)
+    n = len(list_of_arrays[0])
+
+    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
+    for i from 0 <= i < nlevels:
+        # vecs[i] = <int64_t *> (<ndarray> list_of_arrays[i]).data
+        arr = list_of_arrays[i]
+        vecs[i] = <int64_t *> arr.data
+    # assume uniqueness??
+
+    for i from 1 <= i < n:
+        for k from 0 <= k < nlevels:
+            cur = vecs[k][i]
+            pre = vecs[k][i - 1]
+            if cur == pre:
+                continue
+            elif cur > pre:
+                break
+            else:
+                # release the scratch buffer before the early exit so it
+                # does not leak
+                free(vecs)
+                return False
+    free(vecs)
+    return True
+
+
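+# A minimal usage sketch (assuming np is numpy): each list entry is one
+# level's int64 label array, compared lexicographically row by row.
+#
+#   >>> is_lexsorted([np.array([0, 0, 1], dtype=np.int64),
+#   ...               np.array([0, 1, 0], dtype=np.int64)])
+#   True
+#   >>> is_lexsorted([np.array([0, 1, 0], dtype=np.int64)])
+#   False
+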
+# TODO: could do even better if we know something about the data. e.g., if the
+# index has 1-min data and the binner has 5-min data, then the bins are just
+# strides in the index. This is a general, O(max(len(values), len(binner)))
+# method.
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
+                       object closed='left'):
+    """
+    Int64 (datetime64) version of the generic python version in groupby.py
+    """
+    cdef:
+        Py_ssize_t lenidx, lenbin, i, j, bc, vc
+        ndarray[int64_t] bins
+        int64_t l_bin, r_bin
+        bint right_closed = closed == 'right'
+
+    lenidx = len(values)
+    lenbin = len(binner)
+
+    if lenidx <= 0 or lenbin <= 0:
+        raise ValueError("Invalid length for values or for binner")
+
+    # check that the binner fits the data
+    if values[0] < binner[0]:
+        raise ValueError("Values fall before first bin")
+
+    if values[lenidx - 1] > binner[lenbin - 1]:
+        raise ValueError("Values fall after last bin")
+
+    bins = np.empty(lenbin - 1, dtype=np.int64)
+
+    j = 0   # index into values
+    bc = 0  # bin count
+
+    # linear scan
+    for i in range(0, lenbin - 1):
+        l_bin = binner[i]
+        r_bin = binner[i + 1]
+
+        # count values in current bin, advance to next bin
+        while j < lenidx and (values[j] < r_bin or
+                              (right_closed and values[j] == r_bin)):
+            j += 1
+
+        bins[bc] = j
+        bc += 1
+
+    return bins
+
+
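+# A worked example (assuming np is numpy). bins[i] is the cumulative count of
+# values falling at or before the right edge of bin i, so consecutive entries
+# are the slice boundaries of each bin:
+#
+#   >>> vals = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
+#   >>> edges = np.array([0, 3, 6], dtype=np.int64)
+#   >>> generate_bins_dt64(vals, edges, closed='left')   # bins [0, 3), [3, 6)
+#   array([2, 5])
+#   >>> generate_bins_dt64(vals, edges, closed='right')  # bins (0, 3], (3, 6]
+#   array([3, 6])
+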
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def row_bool_subset(ndarray[float64_t, ndim=2] values,
+                    ndarray[uint8_t, cast=True] mask):
+    cdef:
+        Py_ssize_t i, j, n, k, pos = 0
+        ndarray[float64_t, ndim=2] out
+
+    n, k = (<object> values).shape
+    assert(n == len(mask))
+
+    out = np.empty((mask.sum(), k), dtype=np.float64)
+
+    for i in range(n):
+        if mask[i]:
+            for j in range(k):
+                out[pos, j] = values[i, j]
+            pos += 1
+
+    return out
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def row_bool_subset_object(ndarray[object, ndim=2] values,
+                           ndarray[uint8_t, cast=True] mask):
+    cdef:
+        Py_ssize_t i, j, n, k, pos = 0
+        ndarray[object, ndim=2] out
+
+    n, k = (<object> values).shape
+    assert(n == len(mask))
+
+    out = np.empty((mask.sum(), k), dtype=object)
+
+    for i in range(n):
+        if mask[i]:
+            for j in range(k):
+                out[pos, j] = values[i, j]
+            pos += 1
+
+    return out
+
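+# A minimal usage sketch for the two row-subset helpers (assuming np is
+# numpy): keep the rows of a 2-d array where the boolean mask is True.
+#
+#   >>> vals = np.array([[1., 2.], [3., 4.], [5., 6.]])
+#   >>> row_bool_subset(vals, np.array([True, False, True]))
+#   array([[ 1.,  2.],
+#          [ 5.,  6.]])
+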
+def group_count(ndarray[int64_t] values, Py_ssize_t size):
+    cdef:
+        Py_ssize_t i, n = len(values)
+        ndarray[int64_t] counts
+
+    counts = np.zeros(size, dtype=np.int64)
+    for i in range(n):
+        counts[values[i]] += 1
+    return counts
+
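+# A minimal usage sketch (assuming np is numpy): tally how many times each
+# group id in [0, size) occurs.
+#
+#   >>> group_count(np.array([0, 1, 1, 2, 2, 2], dtype=np.int64), 3)
+#   array([1, 2, 3])
+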
+def lookup_values(ndarray[object] values, dict mapping):
+    cdef:
+        Py_ssize_t i, n = len(values)
+
+    result = np.empty(n, dtype='O')
+    for i in range(n):
+        result[i] = mapping[values[i]]
+    return maybe_convert_objects(result)
+
+
+def count_level_1d(ndarray[uint8_t, cast=True] mask,
+                   ndarray[int64_t] labels, Py_ssize_t max_bin):
+    cdef:
+        Py_ssize_t i, n
+        ndarray[int64_t] counts
+
+    counts = np.zeros(max_bin, dtype='i8')
+
+    n = len(mask)
+
+    for i from 0 <= i < n:
+        if mask[i]:
+            counts[labels[i]] += 1
+
+    return counts
+
+
+def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
+                   ndarray[int64_t] labels, Py_ssize_t max_bin):
+    cdef:
+        Py_ssize_t i, j, k, n
+        ndarray[int64_t, ndim=2] counts
+
+    n, k = (<object> mask).shape
+    counts = np.zeros((max_bin, k), dtype='i8')
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            if mask[i, j]:
+                counts[labels[i], j] += 1
+
+    return counts
+
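+# A minimal usage sketch (assuming np is numpy): count unmasked entries per
+# (label, column) cell. Row labels [0, 0, 1] with a 3x2 mask give:
+#
+#   >>> mask = np.array([[True, False], [True, True], [False, True]])
+#   >>> count_level_2d(mask, np.array([0, 0, 1], dtype=np.int64), 2)
+#   array([[2, 1],
+#          [0, 1]])
+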
+cdef class _PandasNull:
+
+    def __richcmp__(_PandasNull self, object other, int op):
+        if op == 2:  # ==
+            return isinstance(other, _PandasNull)
+        elif op == 3:  # !=
+            return not isinstance(other, _PandasNull)
+        else:
+            return False
+
+    def __hash__(self):
+        return 0
+
+pandas_null = _PandasNull()
+
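+# Unlike np.nan, the pandas_null sentinel compares equal to itself and hashes
+# consistently, so tuples containing it behave sensibly as dict keys:
+#
+#   >>> pandas_null == pandas_null, pandas_null != pandas_null
+#   (True, False)
+#   >>> hash(pandas_null)
+#   0
+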
+def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
+    '''
+    For zipping multiple ndarrays into an ndarray of tuples
+    '''
+    cdef:
+        Py_ssize_t i, j, k, n
+        ndarray[object] result
+        flatiter it
+        object val, tup
+
+    k = len(ndarrays)
+    n = len(ndarrays[0])
+
+    result = np.empty(n, dtype=object)
+
+    # initialize tuples on first pass
+    arr = ndarrays[0]
+    it = <flatiter> PyArray_IterNew(arr)
+    for i in range(n):
+        val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
+        tup = PyTuple_New(k)
+
+        # val != val catches NaN-like values
+        if val != val:
+            val = fill_value
+
+        PyTuple_SET_ITEM(tup, 0, val)
+        Py_INCREF(val)
+        result[i] = tup
+        PyArray_ITER_NEXT(it)
+
+    for j in range(1, k):
+        arr = ndarrays[j]
+        it = <flatiter> PyArray_IterNew(arr)
+        if len(arr) != n:
+            raise ValueError('all arrays must be same length')
+
+        for i in range(n):
+            val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
+            if val != val:
+                val = fill_value
+
+            PyTuple_SET_ITEM(result[i], j, val)
+            Py_INCREF(val)
+            PyArray_ITER_NEXT(it)
+
+    return result
+
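+# A minimal usage sketch (assuming np is numpy; output repr is schematic):
+# NaNs are replaced by the fill_value sentinel while zipping element-wise.
+#
+#   >>> fast_zip_fillna([np.array([1., np.nan]),
+#   ...                  np.array(['a', 'b'], dtype=object)])
+#   array([(1.0, 'a'), (pandas_null, 'b')], dtype=object)
+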
+def duplicated(ndarray[object] values, take_last=False):
+    cdef:
+        Py_ssize_t i, n
+        dict seen = {}
+        object row
+
+    n = len(values)
+    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
+
+    if take_last:
+        for i from n > i >= 0:
+            row = values[i]
+
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = None
+                result[i] = 0
+    else:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = None
+                result[i] = 0
+
+    return result.view(np.bool_)
+
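+# A minimal usage sketch (assuming np is numpy): mark every occurrence after
+# the first (or before the last, with take_last=True) as a duplicate.
+#
+#   >>> duplicated(np.array(['a', 'b', 'a'], dtype=object))
+#   array([False, False,  True], dtype=bool)
+#   >>> duplicated(np.array(['a', 'b', 'a'], dtype=object), take_last=True)
+#   array([ True, False, False], dtype=bool)
+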
+def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
+    cdef:
+        Py_ssize_t i, group_size, n, lab, start
+        object slobj
+        ndarray[int64_t] starts, ends
+
+    n = len(labels)
+
+    starts = np.zeros(ngroups, dtype=np.int64)
+    ends = np.zeros(ngroups, dtype=np.int64)
+
+    start = 0
+    group_size = 0
+    for i in range(n):
+        group_size += 1
+        lab = labels[i]
+        if i == n - 1 or lab != labels[i + 1]:
+            starts[lab] = start
+            ends[lab] = start + group_size
+            start += group_size
+            group_size = 0
+
+    return starts, ends
+
+
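+# A minimal usage sketch (assuming np is numpy; labels must be pre-sorted so
+# equal labels are contiguous): per-group half-open [start, end) offsets.
+#
+#   >>> generate_slices(np.array([0, 0, 1, 1, 1], dtype=np.int64), 2)
+#   (array([0, 2]), array([2, 5]))
+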
+def indices_fast(object index, ndarray[int64_t] labels, list keys,
+                 list sorted_labels):
+    cdef:
+        Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+        dict result = {}
+        object tup
+
+    k = len(keys)
+
+    if n == 0:
+        return result
+
+    start = 0
+    cur = labels[0]
+    for i in range(1, n):
+        lab = labels[i]
+
+        if lab != cur:
+            if lab != -1:
+                tup = PyTuple_New(k)
+                for j in range(k):
+                    val = util.get_value_at(keys[j],
+                                            sorted_labels[j][i - 1])
+                    PyTuple_SET_ITEM(tup, j, val)
+                    Py_INCREF(val)
+
+                result[tup] = index[start:i]
+            start = i
+        cur = lab
+
+    tup = PyTuple_New(k)
+    for j in range(k):
+        val = util.get_value_at(keys[j],
+                                sorted_labels[j][n - 1])
+        PyTuple_SET_ITEM(tup, j, val)
+        Py_INCREF(val)
+    result[tup] = index[start:]
+
+    return result
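+
+# A minimal usage sketch (assuming np is numpy, one grouping level; output
+# repr is schematic): map each key tuple to its slice of `index`.
+#
+#   >>> idx = np.arange(4)
+#   >>> labs = np.array([0, 0, 1, 1], dtype=np.int64)
+#   >>> keys = [np.array(['a', 'b'], dtype=object)]
+#   >>> indices_fast(idx, labs, keys, [labs])
+#   {('a',): array([0, 1]), ('b',): array([2, 3])}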
 
-include "groupby.pyx"
-include "reindex.pyx"
 include "reduce.pyx"
 include "properties.pyx"
 include "inference.pyx"