Skip to content

Commit 1d78502

Browse files
committed
ENH: use Vector classes in hash tables. attempt to fix clang build issues #2188
1 parent 5702b2a commit 1d78502

File tree

6 files changed

+90
-125
lines changed

6 files changed

+90
-125
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ pandas 0.10.0
6969
(#2295)
7070
- Respect dtype=object in DataFrame constructor (#2291)
7171
- Fix DatetimeIndex.join bug with tz-aware indexes and how='outer' (#2317)
72+
- pop(...) and del work with DataFrame with duplicate columns (#2349)
7273

7374
pandas 0.9.1
7475
============

pandas/core/algorithms.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,16 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
117117
"""
118118
values = np.asarray(values)
119119
is_datetime = com.is_datetime64_dtype(values)
120-
hash_klass, values = _get_data_algo(values, _hashtables)
120+
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
121121

122-
uniques = []
123122
table = hash_klass(len(values))
123+
uniques = vec_klass()
124124
labels, counts = table.get_labels(values, uniques, 0, na_sentinel)
125125

126126
labels = com._ensure_platform_int(labels)
127127

128-
uniques = com._asarray_tuplesafe(uniques)
128+
uniques = uniques.to_array(xfer_data=True)
129+
129130
if sort and len(counts) > 0:
130131
sorter = uniques.argsort()
131132
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
@@ -325,7 +326,7 @@ def group_position(*args):
325326
}
326327

327328
_hashtables = {
328-
'float64': lib.Float64HashTable,
329-
'int64': lib.Int64HashTable,
330-
'generic': lib.PyObjectHashTable
329+
'float64': (lib.Float64HashTable, lib.Float64Vector),
330+
'int64': (lib.Int64HashTable, lib.Int64Vector),
331+
'generic': (lib.PyObjectHashTable, lib.ObjectVector)
331332
}

pandas/src/hashtable.pyx

+69-116
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,18 @@ cdef extern from "kvec.h":
3535
size_t n, m
3636
int64_t* a
3737

38+
ctypedef struct kv_double:
39+
size_t n, m
40+
double* a
41+
3842
ctypedef struct kv_object_t:
3943
size_t n, m
4044
PyObject** a
4145

4246
inline void kv_object_push(kv_object_t *v, PyObject* x)
4347
inline void kv_object_destroy(kv_object_t *v)
4448
inline void kv_int64_push(kv_int64_t *v, int64_t x)
49+
inline void kv_double_push(kv_double *v, double x)
4550

4651

4752
cdef class ObjectVector:
@@ -53,6 +58,9 @@ cdef class ObjectVector:
5358
def __cinit__(self):
5459
self.owndata = 1
5560

61+
def __len__(self):
62+
return self.vec.n
63+
5664
def to_array(self, xfer_data=True):
5765
""" Here we use the __array__ method, that is called when numpy
5866
tries to get an array from the object."""
@@ -68,9 +76,10 @@ cdef class ObjectVector:
6876

6977
# urgh, mingw32 barfs because of this
7078

71-
# if xfer_data:
72-
# self.owndata = 0
73-
# util.set_array_owndata(result)
79+
if xfer_data:
80+
self.owndata = 0
81+
util.set_array_owndata(result)
82+
7483
# return result
7584

7685
return result.copy()
@@ -92,6 +101,9 @@ cdef class Int64Vector:
92101
def __cinit__(self):
93102
self.owndata = 1
94103

104+
def __len__(self):
105+
return self.vec.n
106+
95107
def to_array(self, xfer_data=True):
96108
""" Here we use the __array__ method, that is called when numpy
97109
tries to get an array from the object."""
@@ -118,6 +130,44 @@ cdef class Int64Vector:
118130
if self.owndata:
119131
free(self.vec.a)
120132

133+
cdef class Float64Vector:
# Vector of C doubles held in a kvec `kv_double` struct. Values are added
# with append() and exported to NumPy with to_array().
134+
135+
cdef:
136+
# owndata: when true, __dealloc__ frees vec.a. Cleared by
# to_array(xfer_data=True).
bint owndata
137+
kv_double vec
138+
139+
def __cinit__(self):
140+
# A fresh vector is responsible for freeing its own buffer.
self.owndata = 1
141+
142+
def __len__(self):
143+
# Number of elements currently stored (the kvec `n` field).
return self.vec.n
144+
145+
def to_array(self, xfer_data=True):
146+
# NOTE(review): the docstring below mentions `__array__`, but this method
# is named to_array; the text looks copied from elsewhere -- confirm and
# reword.
""" Here we use the __array__ method, that is called when numpy
147+
tries to get an array from the object."""
148+
cdef:
149+
npy_intp shape[1]
150+
ndarray result
151+
152+
shape[0] = <npy_intp> self.vec.n
153+
154+
# Create a 1D float64 array of length self.vec.n on top of self.vec.a
155+
result = PyArray_SimpleNewFromData(1, shape, np.NPY_FLOAT64,
156+
self.vec.a)
157+
158+
# With xfer_data the vector stops freeing the buffer in __dealloc__ and
# the array is flagged via util.set_array_owndata (presumably so NumPy
# frees it instead -- confirm in util).
# NOTE(review): self.vec still points at the buffer afterwards, so a later
# append() or a second to_array(xfer_data=True) looks unsafe -- verify
# that callers never do this.
if xfer_data:
159+
self.owndata = 0
160+
util.set_array_owndata(result)
161+
162+
return result
163+
164+
cdef inline append(self, float64_t x):
165+
# Push x onto the underlying kvec.
kv_double_push(&self.vec, x)
166+
167+
def __dealloc__(self):
168+
# Only release the buffer if ownership was not handed to an ndarray.
if self.owndata:
169+
free(self.vec.a)
170+
121171

122172
cdef class HashTable:
123173
pass
@@ -197,7 +247,7 @@ cdef class StringHashTable(HashTable):
197247
object val
198248
char *buf
199249
khiter_t k
200-
list uniques = []
250+
ObjectVector uniques = ObjectVector()
201251

202252
for i in range(n):
203253
val = values[i]
@@ -212,7 +262,7 @@ cdef class StringHashTable(HashTable):
212262
uniques.append(val)
213263

214264
# return None
215-
return uniques
265+
return uniques.to_array(xfer_data=True)
216266

217267
def factorize(self, ndarray[object] values):
218268
cdef:
@@ -471,7 +521,7 @@ cdef class Int64HashTable(HashTable):
471521
labels, counts = self.get_labels(values, reverse, 0)
472522
return reverse, labels, counts
473523

474-
def get_labels(self, ndarray[int64_t] values, list uniques,
524+
def get_labels(self, ndarray[int64_t] values, Int64Vector uniques,
475525
Py_ssize_t count_prior, Py_ssize_t na_sentinel):
476526
cdef:
477527
Py_ssize_t i, n = len(values)
@@ -570,7 +620,6 @@ def value_count_int64(ndarray[int64_t] values):
570620
Py_ssize_t i, n = len(values)
571621
kh_int64_t *table
572622
int ret = 0
573-
list uniques = []
574623

575624
table = kh_init_int64()
576625
kh_resize_int64(table, n)
@@ -615,11 +664,12 @@ cdef class Float64HashTable(HashTable):
615664
kh_destroy_float64(self.table)
616665

617666
def factorize(self, ndarray[float64_t] values):
618-
uniques = []
667+
uniques = Float64Vector()
619668
labels, counts = self.get_labels(values, uniques, 0, -1)
620-
return uniques, labels, counts
669+
return uniques.to_array(xfer_data=True), labels, counts
621670

622-
cpdef get_labels(self, ndarray[float64_t] values, list uniques,
671+
cpdef get_labels(self, ndarray[float64_t] values,
672+
Float64Vector uniques,
623673
Py_ssize_t count_prior, int64_t na_sentinel):
624674
cdef:
625675
Py_ssize_t i, n = len(values)
@@ -690,7 +740,7 @@ cdef class Float64HashTable(HashTable):
690740
int ret = 0
691741
float64_t val
692742
khiter_t k
693-
list uniques = []
743+
Float64Vector uniques = Float64Vector()
694744
bint seen_na = 0
695745

696746
# TODO: kvec
@@ -708,7 +758,7 @@ cdef class Float64HashTable(HashTable):
708758
seen_na = 1
709759
uniques.append(ONAN)
710760

711-
return uniques
761+
return uniques.to_array(xfer_data=True)
712762

713763
cdef class PyObjectHashTable(HashTable):
714764
cdef kh_pymap_t *table
@@ -842,7 +892,7 @@ cdef class PyObjectHashTable(HashTable):
842892

843893
return result
844894

845-
cpdef get_labels(self, ndarray[object] values, list uniques,
895+
cpdef get_labels(self, ndarray[object] values, ObjectVector uniques,
846896
Py_ssize_t count_prior, int64_t na_sentinel):
847897
cdef:
848898
Py_ssize_t i, n = len(values)
@@ -882,12 +932,12 @@ cdef class PyObjectHashTable(HashTable):
882932

883933
cdef class Factorizer:
884934
cdef public PyObjectHashTable table
885-
cdef public uniques
935+
cdef public ObjectVector uniques
886936
cdef public Py_ssize_t count
887937

888938
def __init__(self, size_hint):
889939
self.table = PyObjectHashTable(size_hint)
890-
self.uniques = []
940+
self.uniques = ObjectVector()
891941
self.count = 0
892942

893943
def get_count(self):
@@ -902,7 +952,7 @@ cdef class Factorizer:
902952
if labels.dtype != np.int_:
903953
labels = labels.astype(np.int_)
904954

905-
sorter = list_to_object_array(self.uniques).argsort()
955+
sorter = self.uniques.to_array(xfer_data=False).argsort()
906956
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
907957
reverse_indexer.put(sorter, np.arange(len(sorter)))
908958

@@ -919,12 +969,12 @@ cdef class Factorizer:
919969

920970
cdef class Int64Factorizer:
921971
cdef public Int64HashTable table
922-
cdef public list uniques
972+
cdef public Int64Vector uniques
923973
cdef public Py_ssize_t count
924974

925975
def __init__(self, size_hint):
926976
self.table = Int64HashTable(size_hint)
927-
self.uniques = []
977+
self.uniques = Int64Vector()
928978
self.count = 0
929979

930980
def get_count(self):
@@ -940,7 +990,7 @@ cdef class Int64Factorizer:
940990
if labels.dtype != np.int_:
941991
labels = labels.astype(np.int_)
942992

943-
sorter = list_to_object_array(self.uniques).argsort()
993+
sorter = self.uniques.to_array(xfer_data=False).argsort()
944994
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
945995
reverse_indexer.put(sorter, np.arange(len(sorter)))
946996

@@ -951,103 +1001,6 @@ cdef class Int64Factorizer:
9511001
return labels, counts
9521002

9531003

954-
cdef class DictFactorizer:
955-
956-
cdef public:
957-
dict table
958-
list uniques
959-
Py_ssize_t count
960-
961-
def __init__(self, table=None, uniques=None):
962-
if table is None:
963-
self.table = {}
964-
else:
965-
self.table = table
966-
967-
if uniques is None:
968-
self.uniques = []
969-
self.count = 0
970-
else:
971-
self.uniques = uniques
972-
self.count = len(uniques)
973-
974-
def get_count(self):
975-
return self.count
976-
977-
def get_labels(self, ndarray[object] values):
978-
cdef:
979-
Py_ssize_t i, n = len(values)
980-
ndarray[int64_t] labels
981-
ndarray[int64_t] counts
982-
Py_ssize_t idx, count = self.count
983-
int ret = 0
984-
object val
985-
986-
labels = np.empty(n, dtype=np.int64)
987-
counts = np.empty(count + n, dtype=np.int64)
988-
989-
for i in range(n):
990-
val = values[i]
991-
992-
if val in self.table:
993-
idx = self.table[val]
994-
labels[i] = idx
995-
counts[idx] = counts[idx] + 1
996-
else:
997-
self.table[val] = count
998-
self.uniques.append(val)
999-
labels[i] = count
1000-
counts[count] = 1
1001-
count += 1
1002-
1003-
return labels, counts[:count].copy()
1004-
1005-
def factorize(self, ndarray[object] values, sort=False):
1006-
labels, counts = self.get_labels(values)
1007-
1008-
# sort on
1009-
if sort:
1010-
if labels.dtype != np.int_:
1011-
labels = labels.astype(np.int_)
1012-
1013-
sorter = list_to_object_array(self.uniques).argsort()
1014-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
1015-
reverse_indexer.put(sorter, np.arange(len(sorter)))
1016-
1017-
labels = reverse_indexer.take(labels)
1018-
counts = counts.take(sorter)
1019-
1020-
self.count = len(counts)
1021-
return labels, counts
1022-
1023-
def unique(self, ndarray[object] values):
1024-
cdef:
1025-
Py_ssize_t i, n = len(values)
1026-
Py_ssize_t idx, count = self.count
1027-
object val
1028-
1029-
for i in range(n):
1030-
val = values[i]
1031-
if val not in self.table:
1032-
self.table[val] = count
1033-
self.uniques.append(val)
1034-
count += 1
1035-
return self.uniques
1036-
1037-
1038-
def unique_int64(self, ndarray[int64_t] values):
1039-
cdef:
1040-
Py_ssize_t i, n = len(values)
1041-
Py_ssize_t idx, count = self.count
1042-
int64_t val
1043-
1044-
for i in range(n):
1045-
val = values[i]
1046-
if val not in self.table:
1047-
self.table[val] = count
1048-
self.uniques.append(val)
1049-
count += 1
1050-
return self.uniques
10511004

10521005
def lookup2(ndarray[object] values):
10531006
cdef:

pandas/src/klib/kvec.h

+9
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ typedef struct {
108108
int64_t* a;
109109
} kv_int64_t;
110110

111+
typedef struct {
112+
size_t n, m;
113+
double* a;
114+
} kv_double;
115+
111116
typedef struct {
112117
size_t n, m;
113118
PyObject** a;
@@ -129,6 +134,10 @@ void PANDAS_INLINE kv_int64_push(kv_int64_t *v, int64_t x) {
129134
kv_push(int64_t, v, x);
130135
}
131136

137+
/* Push x onto the kv_double vector v using the generic kv_push macro
   instantiated for double. */
void PANDAS_INLINE kv_double_push(kv_double *v, double x) {
138+
kv_push(double, v, x);
139+
}
140+
132141
void PANDAS_INLINE kv_object_destroy(kv_object_t *v) {
133142
int i;
134143
for (i = 0; i < v->n; ++i)

0 commit comments

Comments
 (0)