From f0706b1843883f12807baeaad8a6046dd8f767e9 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 17 Aug 2015 20:46:55 -0500 Subject: [PATCH 1/3] PERF: value_counts_float64 #10821 --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/core/algorithms.py | 3 +++ pandas/hashtable.pyx | 45 +++++++++++++++++++++++++++++++++ vb_suite/groupby.py | 9 +++++++ 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 7e69a8044a305..8ae2aa1659077 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -585,7 +585,7 @@ Performance Improvements - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`) - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`) - Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`) - +- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`) .. 
_whatsnew_0170.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b0c7ff43bc7d8..21ace4bb1832d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -245,6 +245,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False, elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) + elif com.is_float_dtype(dtype): + values = com._ensure_float64(values) + keys, counts = htable.value_count_float64(values, dropna) else: values = com._ensure_object(values) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 7dbd1b45c938f..fa3e2f6c4ba0b 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -866,7 +866,52 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels +@cython.boundscheck(False) +cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + float64_t val + int ret = 0 + + with nogil: + kh_resize_float64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + +@cython.boundscheck(False) +cpdef value_count_float64(float64_t[:] values, bint dropna): + cdef: + Py_ssize_t i + kh_float64_t * table + float64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_float64() + build_count_table_float64(values, table, dropna) + + i = 0 + result_keys = np.empty(table.n_occupied, dtype=np.float64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_float64(table) + + return np.asarray(result_keys), np.asarray(result_counts) @cython.boundscheck(False) cdef 
build_count_table_int64(int64_t[:] values, kh_int64_t *table): diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 73f5f19d6a626..bceb78c26e6ac 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -194,6 +194,15 @@ def f(): series_value_counts_strings = Benchmark('s.value_counts()', setup, start_date=datetime(2011, 10, 21)) +#value_counts on float dtype + +setup = common_setup + """ +s = Series(np.random.randint(0, 1000, size=100000)).astype(float) +""" + +series_value_counts_float64 = Benchmark('s.value_counts()', setup, + start_date=datetime(2015, 8, 17)) + #---------------------------------------------------------------------- # pivot_table From cf002dcce4adcf45e1256b6f16aa2e2833a4cfc5 Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Tue, 18 Aug 2015 11:25:25 -0600 Subject: [PATCH 2/3] CLN: Combined value_count_int64 and value_count_float64 into a single routine using fused types. --- pandas/core/algorithms.py | 6 +-- pandas/core/categorical.py | 2 +- pandas/hashtable.pyx | 75 +++++++++++++++++--------------------- 3 files changed, 38 insertions(+), 45 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21ace4bb1832d..0b11a2bae3973 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, values = PeriodIndex(values, name=name) values = values.view(np.int64) - keys, counts = htable.value_count_int64(values) + keys, counts = htable.value_count_scalar64(values, dropna) if dropna: from pandas.tslib import iNaT @@ -244,10 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False, elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) - keys, counts = htable.value_count_int64(values) + keys, counts = htable.value_count_scalar64(values, dropna) elif com.is_float_dtype(dtype): values = com._ensure_float64(values) - keys, counts = htable.value_count_float64(values, dropna) + keys,
counts = htable.value_count_scalar64(values, dropna) else: values = com._ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9e30ea31dab8..b204cba997b98 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True): from pandas.core.index import CategoricalIndex cat = self.dropna() if dropna else self - keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes)) + keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna) result = Series(counts, index=keys) ix = np.arange(len(cat.categories), dtype='int64') diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index fa3e2f6c4ba0b..573db92b53565 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -887,29 +887,48 @@ cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dr k = kh_put_float64(table, val, &ret) table.vals[k] = 1 + @cython.boundscheck(False) -cpdef value_count_float64(float64_t[:] values, bint dropna): +cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): cdef: Py_ssize_t i - kh_float64_t * table - float64_t[:] result_keys + kh_float64_t *ftable + kh_int64_t *itable + sixty_four_bit_scalar[:] result_keys int64_t[:] result_counts int k - table = kh_init_float64() - build_count_table_float64(values, table, dropna) - i = 0 - result_keys = np.empty(table.n_occupied, dtype=np.float64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_float64(table) + if sixty_four_bit_scalar is float64_t: + ftable = kh_init_float64() + build_count_table_float64(values, ftable, dropna) + + result_keys = np.empty(ftable.n_occupied, dtype=np.float64) + result_counts = np.zeros(ftable.n_occupied, dtype=np.int64) + + with nogil: + for k in 
range(ftable.n_buckets): + if kh_exist_float64(ftable, k): + result_keys[i] = ftable.keys[k] + result_counts[i] = ftable.vals[k] + i += 1 + kh_destroy_float64(ftable) + + elif sixty_four_bit_scalar is int64_t: + itable = kh_init_int64() + build_count_table_int64(values, itable) + + result_keys = np.empty(itable.n_occupied, dtype=np.int64) + result_counts = np.zeros(itable.n_occupied, dtype=np.int64) + + with nogil: + for k in range(itable.n_buckets): + if kh_exist_int64(itable, k): + result_keys[i] = itable.keys[k] + result_counts[i] = itable.vals[k] + i += 1 + kh_destroy_int64(itable) return np.asarray(result_keys), np.asarray(result_counts) @@ -934,32 +953,6 @@ cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table): table.vals[k] = 1 -@cython.boundscheck(False) -cpdef value_count_int64(int64_t[:] values): - cdef: - Py_ssize_t i - kh_int64_t *table - int64_t[:] result_keys, result_counts - int k - - table = kh_init_int64() - build_count_table_int64(values, table) - - i = 0 - result_keys = np.empty(table.n_occupied, dtype=np.int64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_int64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): From 8bb17cb02c31bb89e87a8fbddec0537fabd1d81d Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Tue, 18 Aug 2015 11:39:15 -0600 Subject: [PATCH 3/3] CLN: Combined build_count_table_int64 and build_count_table_float64 into a single function using fused types. 
--- pandas/hashtable.pyx | 65 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 573db92b53565..dfa7930ada62f 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -866,26 +866,47 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels +ctypedef fused kh_scalar64: + kh_int64_t + kh_float64_t + @cython.boundscheck(False) -cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna): +cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values, + kh_scalar64 *table, bint dropna): cdef: khiter_t k Py_ssize_t i, n = len(values) - float64_t val + sixty_four_bit_scalar val int ret = 0 - with nogil: - kh_resize_float64(table, n) + if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t: + with nogil: + kh_resize_float64(table, n) - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t: + with nogil: + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_float64(table, val, &ret) + k = kh_put_int64(table, val, &ret) table.vals[k] = 1 + else: + raise ValueError("Table type must match scalar type.") + @cython.boundscheck(False) @@ -902,7 +923,7 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): if sixty_four_bit_scalar is float64_t: ftable = kh_init_float64() - build_count_table_float64(values, ftable, dropna) + build_count_table_scalar64(values, ftable, dropna) result_keys = np.empty(ftable.n_occupied, dtype=np.float64) result_counts = 
np.zeros(ftable.n_occupied, dtype=np.int64) @@ -917,7 +938,7 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): elif sixty_four_bit_scalar is int64_t: itable = kh_init_int64() - build_count_table_int64(values, itable) + build_count_table_scalar64(values, itable, dropna) result_keys = np.empty(itable.n_occupied, dtype=np.int64) result_counts = np.zeros(itable.n_occupied, dtype=np.int64) @@ -932,26 +953,6 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): return np.asarray(result_keys), np.asarray(result_counts) -@cython.boundscheck(False) -cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - int64_t val - int ret = 0 - - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, @@ -1040,7 +1041,7 @@ def mode_int64(int64_t[:] values): table = kh_init_int64() - build_count_table_int64(values, table) + build_count_table_scalar64(values, table, 0) modes = np.empty(table.n_buckets, dtype=np.int64)