|
1 |
| -from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check |
| 1 | +from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check, |
| 2 | + PyString_AsStringAndSize, PyDict_Copy) |
2 | 3 |
|
3 | 4 | from khash cimport *
|
4 | 5 | from numpy cimport *
|
@@ -843,6 +844,127 @@ cdef class PyObjectHashTable(HashTable):
|
843 | 844 | return labels
|
844 | 845 |
|
845 | 846 |
|
cdef inline cbuf_t to_cbuf(object s) except *:
    """
    Wrap the character buffer of the string object ``s`` in a ``cbuf_t``.

    Parameters
    ----------
    s : object
        A string object accepted by ``PyString_AsStringAndSize``.

    Returns
    -------
    cbuf_t
        ``buf`` and ``len`` are filled in by ``PyString_AsStringAndSize``.

    Raises
    ------
    TypeError
        Propagated from ``PyString_AsStringAndSize`` when ``s`` is not a
        string object.  The ``except *`` clause is what lets the error
        reach the caller; without it the exception could not propagate
        out of a struct-returning cdef function and an uninitialised
        ``cbuf_t`` would be returned instead.
    """
    cdef cbuf_t output
    # NOTE(review): no reference to ``s`` is taken here, so the returned
    # pointer is presumably only valid while the caller keeps ``s`` alive
    # -- confirm against the CPython docs for PyString_AsStringAndSize.
    PyString_AsStringAndSize(s, <char**>&output.buf, &output.len)
    return output
| 851 | + |
| 852 | + |
cdef class CBufHashTable(HashTable):
    """
    Hash table keyed on the raw character buffer of string objects
    (``cbuf_t``), mapping each key to a ``Py_ssize_t`` value.

    NOTE(review): keys are stored as ``cbuf_t`` values produced by
    ``to_cbuf``; nothing in this class takes a reference to the string
    objects they were built from.  Presumably callers must keep those
    objects alive for as long as the table is used -- confirm.
    """
    cdef kh_cbuf_map_t *table

    def __cinit__(self, int size_hint=1):
        # ``size_hint`` is a C int and can never be None, so the table is
        # always resized to the hint right after creation.
        self.table = kh_init_cbuf_map()
        kh_resize_cbuf_map(self.table, size_hint)

    def __dealloc__(self):
        kh_destroy_cbuf_map(self.table)

    cdef inline int check_type(self, object val):
        """Return non-zero when ``val`` is a string object."""
        return util.is_string_object(val)

    cpdef get_item(self, object val):
        """
        Return the value stored for the string key ``val``.

        Raises
        ------
        KeyError
            If ``val`` is not present in the table.
        """
        cdef khiter_t it
        it = kh_get_cbuf_map(self.table, to_cbuf(val))
        # A lookup that returns ``n_buckets`` signals "not found".
        if it != self.table.n_buckets:
            return self.table.vals[it]
        else:
            raise KeyError(val)

    def get_iter_test(self, object key, Py_ssize_t iterations):
        """
        Look ``key`` up ``iterations`` times and discard the result.

        Nothing is returned; the looked-up value is only read into a
        local.  (Judging by the name this is a timing helper.)
        """
        cdef khiter_t it
        cdef Py_ssize_t i, val
        for i in range(iterations):
            it = kh_get_cbuf_map(self.table, to_cbuf(key))
            if it != self.table.n_buckets:
                val = self.table.vals[it]

    cpdef set_item(self, object key, Py_ssize_t val):
        """
        Insert ``key`` (or find its existing slot) and store ``val`` there.

        Raises
        ------
        KeyError
            If the slot returned by the insertion is not marked as
            existing afterwards.
        """
        cdef:
            khiter_t it
            int ret = 0
            cbuf_t buf

        buf = to_cbuf(key)

        it = kh_put_cbuf_map(self.table, buf, &ret)
        # The slot's key is (re)written with the buffer of the object
        # passed in, whether or not the key was already present.
        self.table.keys[it] = buf
        if kh_exist_cbuf_map(self.table, it):
            self.table.vals[it] = val
        else:
            raise KeyError(key)

    def get_indexer(self, ndarray[object] values):
        """
        Return an int64 array with the stored value for each element of
        ``values``, or -1 where the element is not in the table.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            cbuf_t buf
            int64_t[::1] out = labels
            khiter_t it
            kh_cbuf_map_t *table = self.table

        for i in range(n):
            buf = to_cbuf(values[i])
            it = kh_get_cbuf_map(table, buf)
            if it != table.n_buckets:
                out[i] = table.vals[it]
            else:
                out[i] = -1
        return labels

    def unique(self, ndarray[object] values):
        """
        Return the elements of ``values`` not already in the table, each
        once, in order of first appearance.

        Every newly seen element is inserted into the table as a side
        effect (no value is assigned to it).
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it
            ObjectVector uniques = ObjectVector()

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it == self.table.n_buckets:
                it = kh_put_cbuf_map(self.table, buf, &ret)
                uniques.append(val)

        return uniques.to_array()

    def factorize(self, ndarray[object] values):
        """
        Assign an integer label to each element of ``values``.

        Elements already in the table get their stored value as label;
        elements not yet in the table are inserted and labelled with a
        counter that starts at 0 for each call.

        Returns
        -------
        (dict, ndarray[int64])
            A dict mapping each newly assigned label to its object, and
            the array of labels for ``values``.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
            list reverse = []
            Py_ssize_t idx, count = 0
            int ret = 0
            object val
            cbuf_t buf
            khiter_t it

        for i in range(n):
            val = values[i]
            buf = to_cbuf(val)
            it = kh_get_cbuf_map(self.table, buf)
            if it != self.table.n_buckets:
                idx = self.table.vals[it]
                labels[i] = idx
            else:
                it = kh_put_cbuf_map(self.table, buf, &ret)

                self.table.vals[it] = count
                reverse.append(val)
                labels[i] = count
                count += 1

        # PyDict_Copy only accepts a real dict and fails on an enumerate
        # object, so build the label -> object mapping with dict() instead.
        return dict(enumerate(reverse)), labels
| 964 | + |
| 965 | + |
| 966 | + |
| 967 | + |
846 | 968 | cdef class Factorizer:
|
847 | 969 | cdef public PyObjectHashTable table
|
848 | 970 | cdef public ObjectVector uniques
|
|
0 commit comments