PERF: Use String HashTable in value_counts #52113

Closed (wants to merge 7 commits)

11 changes: 9 additions & 2 deletions asv_bench/benchmarks/series_methods.py
@@ -194,15 +194,22 @@ def time_clip(self):


class ValueCounts:
params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]]
params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object", "string"]]
param_names = ["N", "dtype"]

def setup(self, N, dtype):
self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
ser = Series(np.random.randint(0, N, size=10 * N))
if dtype == "string":
self.s = tm.makeStringIndex(N)
else:
self.s = ser.astype(dtype)

def time_value_counts(self, N, dtype):
self.s.value_counts()

def peakmem_value_counts(self, N, dtype):
self.s.value_counts()


class ValueCountsEA:
params = [[10**3, 10**4, 10**5], [True, False]]
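For reference, a rough standalone timing sketch of the new "string" case follows; it is not the asv benchmark itself, and the generated labels merely stand in for tm.makeStringIndex(N).

# Illustrative approximation of the benchmark's "string" setup, not the asv class.
import timeit

import pandas as pd

N = 10**4
# stand-in for tm.makeStringIndex(N): an object-dtype Index of N distinct strings
s = pd.Index([f"label_{i}" for i in range(N)], dtype=object)

# the time_value_counts / peakmem_value_counts body: count the string labels
print(timeit.timeit(s.value_counts, number=100))
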
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -116,6 +116,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`Series.value_counts`, :meth:`DataFrame.value_counts`, and :meth:`Index.value_counts` for strings (:issue:`14860`)
- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
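As a rough illustration (the data and sizes here are invented), these are the kinds of calls the entry above refers to:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
words = rng.choice([f"key_{i}" for i in range(1_000)], size=100_000)

ser = pd.Series(words, dtype=object)                # Python strings in an object-dtype Series
print(ser.value_counts().head())                    # Series.value_counts
print(pd.Index(ser).value_counts().head())          # Index.value_counts
print(ser.to_frame("word").value_counts().head())   # DataFrame.value_counts
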
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyx
@@ -21,6 +21,7 @@ cnp.import_array()


from pandas._libs cimport util
from pandas._libs import lib
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
14 changes: 7 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -1269,7 +1269,7 @@ cdef class PyObjectHashTable(HashTable):
khiter_t k
hash(key)

k = kh_get_pymap(self.table, <PyObject*>key)
k = kh_get_pymap(self.table, key)
return k != self.table.n_buckets

def sizeof(self, deep: bool = False) -> int:
@@ -1296,7 +1296,7 @@ cdef class PyObjectHashTable(HashTable):
cdef:
khiter_t k

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
@@ -1310,7 +1310,7 @@ cdef class PyObjectHashTable(HashTable):

hash(key)

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
k = kh_put_pymap(self.table, key, &ret)
if kh_exist_pymap(self.table, k):
self.table.vals[k] = val
else:
@@ -1328,7 +1328,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap(self.table, val, &ret)
self.table.vals[k] = i

def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
@@ -1345,7 +1345,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
@@ -1418,10 +1418,10 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap(self.table, val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap(self.table, val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
94 changes: 56 additions & 38 deletions pandas/_libs/hashtable_func_helper.pxi.in
@@ -6,34 +6,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ttype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128',
# name, vector_name, dtype, ttype, c_type, to_c_type
dtypes = [('complex128', 'Complex128', 'complex128', 'complex128',
'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64',
('complex64', 'Complex64', 'complex64', 'complex64',
'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'object', '<PyObject*>'),
('Int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8_t', '')]
('float64', 'Float64', 'float64', 'float64', 'float64_t', ''),
('float32', 'Float32', 'float32', 'float32', 'float32_t', ''),
('uint64', 'UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('uint32', 'UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('uint16', 'UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('uint8', 'UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('object', 'Object', 'object', 'pymap', 'object', ''),
('string', 'Object', 'object', 'str', 'char *', 'util.get_c_string'),
Review comment (Member): I think you can make this const char *

('int64', 'Int64', 'int64', 'int64', 'int64_t', ''),
('int32', 'Int32', 'int32', 'int32', 'int32_t', ''),
('int16', 'Int16', 'int16', 'int16', 'int16_t', ''),
('int8', 'Int8', 'int8', 'int8', 'int8_t', '')]

}}

{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
{{for name, vector_name, dtype, ttype, c_type, to_c_type in dtypes}}


@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
cdef value_count_{{name}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
cdef value_count_{{name}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
Py_ssize_t i = 0
@@ -43,12 +43,18 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k

{{if name != 'string'}}
{{c_type}} val
{{else}}
object val
{{endif}}

int ret = 0
bint uses_mask = mask is not None
bint isna_entry = False

dict na_dict = dict()

if uses_mask and not dropna:
raise NotImplementedError("uses_mask not implemented with dropna=False")

@@ -57,7 +64,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
# table maps keys to counts
# result_keys remembers the original order of keys

result_keys = {{name}}Vector()
result_keys = {{vector_name}}Vector()
table = kh_init_{{ttype}}()

{{if dtype == 'object'}}
@@ -69,11 +76,20 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
for i in range(n):
val = values[i]
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, {{to_c_type}}val)
{{if name == 'string'}}
if not isinstance(val, str):
if val in na_dict:
na_dict[val] += 1
else:
na_dict[val] = 1
result_keys.append(val)
continue
{{endif}}
k = kh_get_{{ttype}}(table, {{to_c_type}}(val))
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
k = kh_put_{{ttype}}(table, {{to_c_type}}(val), &ret)
table.vals[k] = 1
result_keys.append(val)
{{else}}
@@ -100,11 +116,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8

# collect counts in the order corresponding to result_keys:
cdef:
int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
int64_t[::1] result_counts = np.empty(len(result_keys), dtype=np.int64)

for i in range(table.size):
for i in range(len(result_keys)):
{{if dtype == 'object'}}
k = kh_get_{{ttype}}(table, result_keys.data[i])
val = <object>result_keys.data[i]
Review comment (Member): Why did the object branch need to change?

{{if name == 'string'}}
if not isinstance(val, str):
result_counts[i] = na_dict[val]
continue
{{endif}}
k = kh_get_{{ttype}}(table, {{to_c_type}}(val))
{{else}}
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
{{endif}}
@@ -115,27 +137,25 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
return result_keys.to_array(), result_counts.base


{{if name != 'string'}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
cdef duplicated_{{name}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
cdef duplicated_{{name}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
{{endif}}
cdef:
int ret = 0
{{if dtype != 'object'}}
{{c_type}} value
{{else}}
PyObject* value
{{endif}}
Py_ssize_t i, n = len(values), first_na = -1
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
bint seen_na = False, uses_mask = mask is not None
bint seen_multiple_na = False


kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

if keep not in ('last', 'first', False):
@@ -197,19 +217,19 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons

kh_destroy_{{ttype}}(table)
return out

{{endif}}

# ----------------------------------------------------------------------
# Membership
# ----------------------------------------------------------------------


{{if name != 'string'}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
cdef ismember_{{name}}(ndarray[{{dtype}}] arr, ndarray[{{dtype}}] values):
{{else}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
cdef ismember_{{name}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
"""
Return boolean of values in arr on an
@@ -230,11 +250,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
int ret = 0
ndarray[uint8_t] result

{{if dtype == "object"}}
PyObject* val
{{else}}
{{c_type}} val
{{endif}}

kh_{{ttype}}_t *table = kh_init_{{ttype}}()

@@ -267,7 +283,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):

kh_destroy_{{ttype}}(table)
return result.view(np.bool_)

{{endif}}
# ----------------------------------------------------------------------
# Mode Computations
# ----------------------------------------------------------------------
@@ -283,6 +299,8 @@ ctypedef fused htfunc_t:

cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
if htfunc_t is object:
if lib.is_string_array(values, skipna=True):
Review comment (Member): is there a viable way to avoid needing lib here? this file is relatively low-dependency and lib is the opposite

return value_count_string(values, dropna, mask=mask)
return value_count_object(values, dropna, mask=mask)

elif htfunc_t is int8_t:
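To summarize the new code path in plain terms: value_count() first checks, via lib.is_string_array, whether an object array holds only strings plus missing values; if so, string keys are counted in a char*-keyed khash table (through util.get_c_string), while the non-string entries, i.e. the NA-likes, are tallied in a small Python dict, and first-seen order is preserved for the result. The following is only a pure-Python model of that logic, not the Cython implementation:

# Pure-Python model of the value_count_string logic, for illustration only.
import numpy as np


def value_count_string_model(values, dropna):
    str_counts = {}   # stands in for the char*-keyed khash table
    na_counts = {}    # non-string entries (None, np.nan, ...) counted separately
    result_keys = []  # keys in first-seen order, like result_keys in the Cython code

    for val in values:
        is_null = not isinstance(val, str)  # after is_string_array, non-str implies null
        if dropna and is_null:
            continue
        counts = na_counts if is_null else str_counts
        if val in counts:
            counts[val] += 1
        else:
            counts[val] = 1
            result_keys.append(val)

    # collect counts in the order corresponding to result_keys
    result_counts = np.empty(len(result_keys), dtype=np.int64)
    for i, key in enumerate(result_keys):
        source = str_counts if isinstance(key, str) else na_counts
        result_counts[i] = source[key]
    return np.array(result_keys, dtype=object), result_counts


keys, counts = value_count_string_model(
    np.array(["a", "b", "a", None, "a"], dtype=object), dropna=False
)
print(list(keys), list(counts))  # ['a', 'b', None] [3, 1, 1]
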
6 changes: 3 additions & 3 deletions pandas/_libs/khash.pxd
@@ -53,9 +53,9 @@ cdef extern from "khash_python.h":
kh_pymap_t* kh_init_pymap()
void kh_destroy_pymap(kh_pymap_t*)
void kh_clear_pymap(kh_pymap_t*)
khuint_t kh_get_pymap(kh_pymap_t*, PyObject*)
khuint_t kh_get_pymap(kh_pymap_t*, object)
Review comment (Member): Is this just a cleanup or did it actually change the behavior in some way? I have a slight preference for the extern declarations to match exactly the way they are declared in the source headers, to minimize having to interpret what is happening; at the end of the day the linker isn't going to type check, just resolve the symbols.

void kh_resize_pymap(kh_pymap_t*, khuint_t)
khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
khuint_t kh_put_pymap(kh_pymap_t*, object, int*)
void kh_del_pymap(kh_pymap_t*, khuint_t)

bint kh_exist_pymap(kh_pymap_t*, khiter_t)
@@ -76,7 +76,7 @@ cdef extern from "khash_python.h":

bint kh_exist_pyset(kh_pyset_t*, khiter_t)

ctypedef char* kh_cstr_t
ctypedef const char* kh_cstr_t

ctypedef struct kh_str_t:
khuint_t n_buckets, size, n_occupied, upper_bound