Skip to content

Commit 21f7a6b

Browse files
committed
REF: more consolidation, cleanup
1 parent 11dcced commit 21f7a6b

File tree

3 files changed

+354
-495
lines changed

3 files changed

+354
-495
lines changed

pandas/lib.pyx

+354-2
Original file line numberDiff line numberDiff line change
@@ -770,9 +770,361 @@ def create_hdf_rows_3d(ndarray index, ndarray columns,
770770

771771
return l
772772

773+
#-------------------------------------------------------------------------------
774+
# Groupby-related functions
775+
776+
@cython.boundscheck(False)
def arrmap(ndarray[object] index, object func):
    """
    Apply `func` to every element of `index`, returning a new object ndarray
    of the same length with the mapped values.
    """
    cdef int n = index.shape[0]
    cdef int i = 0

    cdef ndarray[object] out = np.empty(n, dtype=np.object_)

    for i in range(n):
        out[i] = func(index[i])

    return out
787+
788+
@cython.wraparound(False)
@cython.boundscheck(False)
def is_lexsorted(list list_of_arrays):
    """
    Return True if the given equal-length int64 label arrays are
    lexicographically sorted (level 0 is the primary key, level 1 breaks
    ties, and so on).

    Parameters
    ----------
    list_of_arrays : list of int64 ndarrays, all the same length

    Returns
    -------
    bool
    """
    cdef:
        Py_ssize_t i, n, nlevels  # Py_ssize_t: `int` would overflow for very long arrays
        int64_t k, cur, pre
        ndarray arr

    nlevels = len(list_of_arrays)
    n = len(list_of_arrays[0])

    # Cache raw data pointers so the inner comparison loop avoids
    # per-element Python/buffer overhead.
    cdef int64_t **vecs = <int64_t**> malloc(nlevels * sizeof(int64_t*))
    for i in range(nlevels):
        arr = list_of_arrays[i]
        vecs[i] = <int64_t *> arr.data
    # assume uniqueness??

    for i in range(1, n):
        for k in range(nlevels):
            cur = vecs[k][i]
            pre = vecs[k][i - 1]
            if cur == pre:
                # tie at this level; compare the next level
                continue
            elif cur > pre:
                # strictly increasing at this level; later levels irrelevant
                break
            else:
                # out of order -- BUGFIX: free the malloc'd pointer table
                # before returning (previously leaked on this path)
                free(vecs)
                return False
    free(vecs)
    return True
820+
821+
822+
823+
# TODO: could do even better if we know something about the data. eg, index has
# 1-min data, binner has 5-min data, then bins are just strides in index. This
# is a general, O(max(len(values), len(binner))) method.

@cython.boundscheck(False)
@cython.wraparound(False)
def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
                       object closed='left'):
    """
    Int64 (datetime64) version of generic python version in groupby.py

    For each bin [binner[i], binner[i+1]) (or (..] when closed='right'),
    returns the index into `values` one past the last element belonging to
    that bin.  Both `values` and `binner` are assumed sorted ascending --
    TODO confirm against callers; the linear scan below relies on it.

    Parameters
    ----------
    values : sorted int64 ndarray (the data being binned)
    binner : sorted int64 ndarray of bin edges, len(binner) >= 2
    closed : 'left' (default) or 'right' -- which edge is inclusive

    Returns
    -------
    bins : int64 ndarray of length len(binner) - 1

    Raises
    ------
    ValueError
        If either input is empty or `values` falls outside the binner range.
    """
    cdef:
        Py_ssize_t lenidx, lenbin, i, j, bc, vc    # NOTE: vc is declared but unused
        ndarray[int64_t] bins
        int64_t l_bin, r_bin                        # NOTE: l_bin is assigned but never read
        bint right_closed = closed == 'right'

    lenidx = len(values)
    lenbin = len(binner)

    if lenidx <= 0 or lenbin <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values falls before first bin")

    if values[lenidx-1] > binner[lenbin-1]:
        raise ValueError("Values falls after last bin")

    bins = np.empty(lenbin - 1, dtype=np.int64)

    j = 0  # index into values
    bc = 0  # bin count

    # linear scan: j only ever moves forward, so the whole pass is
    # O(len(values) + len(binner))
    for i in range(0, lenbin - 1):
        l_bin = binner[i]
        r_bin = binner[i+1]

        # count values in current bin, advance to next bin
        while j < lenidx and (values[j] < r_bin or
                              (right_closed and values[j] == r_bin)):
            j += 1

        bins[bc] = j
        bc += 1

    return bins
872+
873+
874+
875+
876+
@cython.boundscheck(False)
@cython.wraparound(False)
def row_bool_subset(ndarray[float64_t, ndim=2] values,
                    ndarray[uint8_t, cast=True] mask):
    """
    Return a new 2-D float64 array containing only the rows of `values`
    for which the corresponding entry of `mask` is true.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, write_at = 0
        ndarray[float64_t, ndim=2] selected

    nrows, ncols = (<object> values).shape
    assert(nrows == len(mask))

    # one output row per set mask bit
    selected = np.empty((mask.sum(), ncols), dtype=np.float64)

    for row in range(nrows):
        if not mask[row]:
            continue
        for col in range(ncols):
            selected[write_at, col] = values[row, col]
        write_at += 1

    return selected
896+
897+
@cython.boundscheck(False)
@cython.wraparound(False)
def row_bool_subset_object(ndarray[object, ndim=2] values,
                           ndarray[uint8_t, cast=True] mask):
    """
    Object-dtype counterpart of row_bool_subset: return the rows of
    `values` selected by the boolean `mask`.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, write_at = 0
        ndarray[object, ndim=2] selected

    nrows, ncols = (<object> values).shape
    assert(nrows == len(mask))

    # one output row per set mask bit
    selected = np.empty((mask.sum(), ncols), dtype=object)

    for row in range(nrows):
        if not mask[row]:
            continue
        for col in range(ncols):
            selected[write_at, col] = values[row, col]
        write_at += 1

    return selected
917+
918+
919+
def group_count(ndarray[int64_t] values, Py_ssize_t size):
    """
    Tally the occurrences of each label in `values`; labels are assumed to
    lie in [0, size) -- TODO confirm callers never pass -1 sentinels.
    """
    cdef:
        Py_ssize_t idx, n = len(values)
        ndarray[int64_t] counts = np.zeros(size, dtype=np.int64)

    for idx in range(n):
        counts[values[idx]] += 1
    return counts
928+
929+
def lookup_values(ndarray[object] values, dict mapping):
    """
    Translate each element of `values` through `mapping` and return the
    result, downcast via maybe_convert_objects where possible.  Raises
    KeyError on any element absent from `mapping`.
    """
    cdef:
        Py_ssize_t pos, n = len(values)

    out = np.empty(n, dtype='O')
    for pos in range(n):
        out[pos] = mapping[values[pos]]
    return maybe_convert_objects(out)
937+
938+
939+
def count_level_1d(ndarray[uint8_t, cast=True] mask,
                   ndarray[int64_t] labels, Py_ssize_t max_bin):
    """
    Per-label count of set positions: counts[lab] is the number of indices i
    with mask[i] true and labels[i] == lab.  Labels assumed in [0, max_bin).
    """
    cdef:
        Py_ssize_t idx, n = len(mask)
        ndarray[int64_t] counts = np.zeros(max_bin, dtype='i8')

    for idx in range(n):
        if mask[idx]:
            counts[labels[idx]] += 1

    return counts
954+
955+
956+
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                   ndarray[int64_t] labels, Py_ssize_t max_bin):
    """
    2-D counterpart of count_level_1d: counts[lab, c] is the number of rows r
    with mask[r, c] true and labels[r] == lab.  Labels assumed in [0, max_bin).
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        ndarray[int64_t, ndim=2] counts

    nrows, ncols = (<object> mask).shape
    counts = np.zeros((max_bin, ncols), dtype='i8')

    for row in range(nrows):
        for col in range(ncols):
            if mask[row, col]:
                counts[labels[row], col] += 1

    return counts
971+
972+
cdef class _PandasNull:
    """
    Sentinel null object: compares equal only to other _PandasNull
    instances and hashes to a constant, so distinct nulls collide into
    the same dict/set slot (used e.g. as a NaN stand-in inside tuples).
    """

    def __richcmp__(_PandasNull self, object other, int op):
        # Cython routes all rich comparisons through one method; `op`
        # follows CPython's codes (2 == Py_EQ, 3 == Py_NE).
        if op == 2: # ==
            return isinstance(other, _PandasNull)
        elif op == 3: # !=
            return not isinstance(other, _PandasNull)
        else:
            # ordering comparisons are not meaningful for a null sentinel
            return False

    def __hash__(self):
        # constant hash: any two nulls land in the same hash bucket,
        # consistent with __richcmp__ equality above
        return 0

# module-level shared singleton; used as the default fill in fast_zip_fillna
pandas_null = _PandasNull()
986+
987+
def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
    '''
    For zipping multiple ndarrays into an ndarray of tuples

    Elements that are NaN (detected via `val != val`) are replaced with
    `fill_value` (the pandas_null sentinel by default).

    Parameters
    ----------
    ndarrays : list of 1-d ndarrays, all the same length
    fill_value : object used in place of NaN entries

    Returns
    -------
    ndarray[object] of k-tuples, where k == len(ndarrays)

    Raises
    ------
    ValueError
        If the arrays are not all the same length.
    '''
    cdef:
        Py_ssize_t i, j, k, n
        ndarray[object] result
        flatiter it
        object val, tup

    k = len(ndarrays)
    n = len(ndarrays[0])

    result = np.empty(n, dtype=object)

    # initialize tuples on first pass
    arr = ndarrays[0]
    it = <flatiter> PyArray_IterNew(arr)
    for i in range(n):
        val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
        tup = PyTuple_New(k)

        # NaN is the only value unequal to itself
        if val != val:
            val = fill_value

        # PyTuple_SET_ITEM steals a reference; the Py_INCREF compensates so
        # the tuple owns its slot independently of Cython's handling of `val`
        PyTuple_SET_ITEM(tup, 0, val)
        Py_INCREF(val)
        result[i] = tup
        PyArray_ITER_NEXT(it)

    # fill remaining tuple slots, one array (column) at a time
    for j in range(1, k):
        arr = ndarrays[j]
        it = <flatiter> PyArray_IterNew(arr)
        # NOTE(review): the length check happens after the iterator is
        # created but before any element access, so it is still safe
        if len(arr) != n:
            raise ValueError('all arrays must be same length')

        for i in range(n):
            val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
            if val != val:
                val = fill_value

            # writing into an already-stored tuple; same steal/INCREF
            # pairing as above
            PyTuple_SET_ITEM(result[i], j, val)
            Py_INCREF(val)
            PyArray_ITER_NEXT(it)

    return result
1033+
1034+
def duplicated(ndarray[object] values, take_last=False):
    """
    Return a boolean mask flagging repeated values.  By default the first
    occurrence of each value is unflagged; with take_last=True the scan runs
    backwards, so the *last* occurrence is the one left unflagged.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        dict seen = {}
        object row

    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

    if take_last:
        # reverse scan: last occurrence is seen first, hence kept
        for i in range(n - 1, -1, -1):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = None
                result[i] = 0
    else:
        # forward scan: first occurrence is seen first, hence kept
        for i in range(n):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = None
                result[i] = 0

    return result.view(np.bool_)
1062+
1063+
def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
    """
    Given *sorted* group labels, compute the half-open [start, end) offsets
    of each contiguous group.

    Parameters
    ----------
    labels : int64 ndarray of group labels, assumed sorted so that equal
        labels are contiguous -- TODO confirm callers guarantee this
    ngroups : number of distinct groups; labels assumed in [0, ngroups)

    Returns
    -------
    (starts, ends) : pair of int64 ndarrays of length ngroups
    """
    cdef:
        Py_ssize_t i, group_size, n, lab, start
        # BUGFIX/consistency: `ends` was previously untyped (a plain Python
        # object), defeating fast buffer access; the unused `slobj` local
        # is also dropped.
        ndarray[int64_t] starts, ends

    n = len(labels)

    starts = np.zeros(ngroups, dtype=np.int64)
    ends = np.zeros(ngroups, dtype=np.int64)

    start = 0
    group_size = 0
    for i in range(n):
        group_size += 1
        lab = labels[i]
        # close out the current run at the last element or a label change
        if i == n - 1 or lab != labels[i + 1]:
            starts[lab] = start
            ends[lab] = start + group_size
            start += group_size
            group_size = 0

    return starts, ends
1086+
1087+
1088+
def indices_fast(object index, ndarray[int64_t] labels, list keys,
                 list sorted_labels):
    """
    Build a dict mapping each group key tuple to the slice of `index`
    covering that group's contiguous run in `labels`.

    Parameters
    ----------
    index : sliceable index aligned with `labels`
    labels : int64 ndarray of group codes, contiguous per group; -1 runs
        are skipped (treated as missing)
    keys : list of per-level key arrays (looked up via util.get_value_at)
    sorted_labels : list of per-level label arrays aligned with `labels`

    Returns
    -------
    dict of {key-tuple: index slice}
    """
    cdef:
        Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
        dict result = {}
        object tup

    k = len(keys)

    if n == 0:
        return result

    start = 0
    cur = labels[0]
    for i in range(1, n):
        lab = labels[i]

        if lab != cur:
            # a run just ended at i-1; emit it unless it was the -1 group
            if lab != -1:
                tup = PyTuple_New(k)
                for j in range(k):
                    val = util.get_value_at(keys[j],
                                            sorted_labels[j][i-1])
                    # SET_ITEM steals a reference; INCREF balances it
                    PyTuple_SET_ITEM(tup, j, val)
                    Py_INCREF(val)

                result[tup] = index[start:i]
            start = i
        cur = lab

    # emit the final run, which the loop above never closes out
    tup = PyTuple_New(k)
    for j in range(k):
        val = util.get_value_at(keys[j],
                                sorted_labels[j][n - 1])
        PyTuple_SET_ITEM(tup, j, val)
        Py_INCREF(val)
    result[tup] = index[start:]

    return result
7731127

774-
include "groupby.pyx"
775-
include "reindex.pyx"
7761128
include "reduce.pyx"
7771129
include "properties.pyx"
7781130
include "inference.pyx"

0 commit comments

Comments
 (0)