From 498a4c50d4c8540507baeb275406ca8b91c30b54 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 13 Oct 2020 21:08:56 +0200
Subject: [PATCH 1/8] adding asv-benchmark hash_functions

---
 asv_bench/benchmarks/hash_functions.py | 164 +++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 asv_bench/benchmarks/hash_functions.py

diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
new file mode 100644
index 0000000000000..17bf434acf38a
--- /dev/null
+++ b/asv_bench/benchmarks/hash_functions.py
@@ -0,0 +1,164 @@
+import numpy as np
+
+import pandas as pd
+
+
+class IsinAlmostFullWithRandomInt:
+    params = [
+        [np.float64, np.int64, np.uint64, object],
+        range(10, 21),
+    ]
+    param_names = ["dtype", "exponent"]
+
+    def setup(self, dtype, exponent):
+        M = 3 * 2 ** (exponent - 2)
+        # M is 0.75 * 2**exponent; 0.77 is the maximal share of occupied buckets
+        np.random.seed(42)
+        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
+        self.values = np.random.randint(0, M, M).astype(dtype)
+        self.values_outside = self.values + M
+
+    def time_isin(self, dtype, exponent):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, exponent):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithRandomFloat:
+    params = [
+        [np.float64, object],
+        [
+            1_300,
+            2_000,
+            7_000,
+            8_000,
+            70_000,
+            80_000,
+            750_000,
+            900_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        np.random.seed(42)
+        self.values = np.random.rand(M)
+        self.s = pd.Series(self.values).astype(dtype)
+        np.random.shuffle(self.values)
+        self.values_outside = self.values + 0.1
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, M):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithArangeSorted:
+    params = [
+        [np.float64, np.int64, np.uint64, object],
+        [
+            1_000,
+            2_000,
+            8_000,
+            100_000,
+            1_000_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        self.s = pd.Series(np.arange(M)).astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+
+class IsinWithArange:
+    params = [
+        [np.float64, np.int64, np.uint64, object],
+        [
+            1_000,
+            2_000,
+            8_000,
+        ],
+        [-2, 0, 2],
+    ]
+    param_names = ["dtype", "M", "offset_factor"]
+
+    def setup(self, dtype, M, offset_factor):
+        offset = int(M * offset_factor)
+        np.random.seed(42)
+        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
+        self.s = tmp.astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M, offset_factor):
+        self.s.isin(self.values)
+
+
+class Float64GroupIndex:
+    # GH28303
+    def setup(self):
+        self.df = pd.date_range(
+            start="1/1/2018", end="1/2/2018", periods=10 ** 6
+        ).to_frame()
+        self.group_index = np.round(self.df.index.astype(int) / 1e9)
+
+    def time_groupby(self):
+        self.df.groupby(self.group_index).last()
+
+
+class UniqueAndFactorizeArange:
+    params = range(4, 16)
+    param_names = ["exponent"]
+
+    def setup(self, exponent):
+        base = np.arange(10 ** 4, dtype="float64")
+        self.a2 = np.repeat(base + 10 ** exponent, 100)
+
+    def time_factorize(self, exponent):
+        pd.factorize(self.a2)
+
+    def time_unique(self, exponent):
+        pd.unique(self.a2)
+
+
+class NumericSeriesIndexing:
+
+    params = [
+        (np.int64, np.uint64, np.float64),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index_dtype, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        indices = pd.Index(vals, dtype=index_dtype)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index_dtype, N):
+        # trigger building of mapping
+        self.data.loc[:800]
+
+
+class NumericSeriesIndexingShuffled:
+
+    params = [
+        (np.int64, np.uint64, np.float64),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index_dtype, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        np.random.seed(42)
+        np.random.shuffle(vals)
+        indices = pd.Index(vals, dtype=index_dtype)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index_dtype, N):
+        # trigger building of mapping
+        self.data.loc[:800]

From 937c9ef69c62b2e6aaa13f2b44e168a4672f50da Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Mon, 5 Oct 2020 00:07:14 +0200
Subject: [PATCH 2/8] using murmur2 for probing step

---
 pandas/_libs/src/klib/khash.h | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
index 916838d1e9584..5ac7552e367b1 100644
--- a/pandas/_libs/src/klib/khash.h
+++ b/pandas/_libs/src/klib/khash.h
@@ -143,10 +143,38 @@ typedef khint_t khiter_t;
 #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
 #define __ac_set_isdel_true(flag, i) ((void)0)
 
+
+// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
+khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
+    const khint32_t SEED = 0xc70f6907UL;
+    // M_32 and R_32 are the 'm' and 'r' mixing constants, generated offline.
+    // They're not really 'magic', they just happen to work well.
+    const khint32_t M_32 = 0x5bd1e995;
+    const int R_32 = 24;
+
+    // Initialize the hash to a 'random' value (seed xor 4, the key size in bytes)
+    khint32_t h = SEED ^ 4;
+
+    // mix the single 4-byte key into the hash:
+    k *= M_32;
+    k ^= k >> R_32;
+    k *= M_32;
+
+    h *= M_32;
+    h ^= k;
+
+    // Do a few final mixes of the hash to ensure the "last few
+    // bytes" are well-incorporated. (Really needed here?)
+    h ^= h >> 13;
+    h *= M_32;
+    h ^= h >> 15;
+    return h;
+}
+
 #ifdef KHASH_LINEAR
 #define __ac_inc(k, m) 1
 #else
-#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#define __ac_inc(k, m) ((murmur2_32to32(k) | 1) & (m))
 #endif
 
 #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)

From 3c32be61dc388445676b6704fc1067ad2fd68fc7 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 13 Oct 2020 21:30:23 +0200
Subject: [PATCH 3/8] [PERF] using murmur2 hash for float64 klib-hash-tables

---
 pandas/_libs/src/klib/khash.h        | 48 ++++++++++++++++++++++++++++
 pandas/_libs/src/klib/khash_python.h | 36 ++++++++++++---------
 2 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
index 5ac7552e367b1..61a4e80ea8cbc 100644
--- a/pandas/_libs/src/klib/khash.h
+++ b/pandas/_libs/src/klib/khash.h
@@ -171,6 +171,54 @@ khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
     return h;
 }
 
+// it is possible to have a special x64-version, which would need fewer operations, but
+// always using the 32bit version also has some benefits:
+//    - one code for 32bit and 64bit builds
+//    - the same case for 32bit and 64bit builds
+//    - no performance difference could be measured compared to a possible x64-version
+
+khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
+    const khint32_t SEED = 0xc70f6907UL;
+    // M_32 and R_32 are the 'm' and 'r' mixing constants, generated offline.
+    // They're not really 'magic', they just happen to work well.
+    const khint32_t M_32 = 0x5bd1e995;
+    const int R_32 = 24;
+
+    // Initial 'random' value. NOTE(review): xor-ed with 4 although 8 bytes are hashed - confirm
+    khint32_t h = SEED ^ 4;
+
+    // mix the first 4 bytes (k1) into the hash:
+    k1 *= M_32;
+    k1 ^= k1 >> R_32;
+    k1 *= M_32;
+
+    h *= M_32;
+    h ^= k1;
+
+    // mix the second 4 bytes (k2) into the hash:
+    k2 *= M_32;
+    k2 ^= k2 >> R_32;
+    k2 *= M_32;
+
+    h *= M_32;
+    h ^= k2;
+
+    // Do a few final mixes of the hash to ensure the "last few
+    // bytes" are well-incorporated.
+    h ^= h >> 13;
+    h *= M_32;
+    h ^= h >> 15;
+    return h;
+}
+
+khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
+    khint32_t k1 = (khint32_t)k;
+    khint32_t k2 = (khint32_t)(k >> 32);
+    // k1/k2 are the low/high 32 bits of k; hash them as two 4-byte blocks
+    return murmur2_32_32to32(k1, k2);
+}
+
+
 #ifdef KHASH_LINEAR
 #define __ac_inc(k, m) 1
 #else
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index 2b46d30c3adb6..e50b09bd00d74 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -13,25 +13,31 @@
 // is 64 bits the truncation causes collission issues.  Given all that, we use our own
 // simple hash, viewing the double bytes as an int64 and using khash's default
 // hash for 64 bit integers.
-// GH 13436
+// GH 13436 showed that _Py_HashDouble doesn't work well with khash
+// GH 28303 showed, that the simple xoring-version isn't good enough
+// thus murmur2-hash is used
+
 khint64_t PANDAS_INLINE asint64(double key) {
-  khint64_t val;
-  memcpy(&val, &key, sizeof(double));
-  return val;
+    khint64_t val;
+    memcpy(&val, &key, sizeof(double));
+    return val;
 }
 
-// correct for all inputs but not -0.0 and NaNs
-#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
-
-// correct for all inputs but not NaNs
-#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                       \
-                                        kh_float64_hash_func_0_NAN(0.0) : \
-                                        kh_float64_hash_func_0_NAN(key))
+#define ZERO_HASH 0
+#define NAN_HASH  0
 
-// correct for all
-#define kh_float64_hash_func(key) ((key) != (key) ?                       \
-                                   kh_float64_hash_func_NAN(Py_NAN) :     \
-                                   kh_float64_hash_func_NAN(key))
+khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
+    // 0.0 and -0.0 compare equal, so they must have the same hash:
+    if (val == 0.0){
+        return ZERO_HASH;
+    }
+    // all NaNs (val != val) are treated as equal keys, so they must share one hash:
+    if ( val!=val ){
+        return NAN_HASH;
+    }
+    khint64_t as_int = asint64(val);
+    return murmur2_64to32(as_int);
+}
 
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 

From 450268b751b889b075b70af72ae0c18b4ff83b99 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Wed, 30 Sep 2020 22:09:49 +0200
Subject: [PATCH 4/8] fixing test cases: because the values were the same, the
 order was not unique/stable (obviously depends on hash-values of keys),
 ensure unique ordering

---
 pandas/tests/base/test_value_counts.py |  8 ++------
 pandas/tests/test_algos.py             | 22 ++++++++++------------
 2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 1a6cba1ace35f..0c93ef242517b 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -232,18 +232,14 @@ def test_value_counts_datetime64(index_or_series):
 
     # with NaT
     s = df["dt"].copy()
-    s = klass(list(s.values) + [pd.NaT])
+    s = klass(list(s.values) + [pd.NaT] * 4)
 
     result = s.value_counts()
     assert result.index.dtype == "datetime64[ns]"
     tm.assert_series_equal(result, expected_s)
 
     result = s.value_counts(dropna=False)
-    # GH 35922. NaN-like now sorts to the beginning of duplicate counts
-    idx = pd.to_datetime(
-        ["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
-    )
-    expected_s = Series([3, 2, 1, 1], index=idx)
+    expected_s = pd.concat([pd.Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
     tm.assert_series_equal(result, expected_s)
 
     unique = s.unique()
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 88286448de900..34b7d0e73e914 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1173,12 +1173,12 @@ def test_dropna(self):
         )
 
         tm.assert_series_equal(
-            Series([True, True, False, None]).value_counts(dropna=True),
-            Series([2, 1], index=[True, False]),
+            Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True),
+            Series([3, 2], index=[True, False]),
         )
         tm.assert_series_equal(
-            Series([True, True, False, None]).value_counts(dropna=False),
-            Series([2, 1, 1], index=[True, np.nan, False]),
+            Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False),
+            Series([5, 3, 2], index=[True, False, np.nan]),
         )
         tm.assert_series_equal(
             Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
@@ -1194,26 +1194,24 @@ def test_dropna(self):
             Series([2, 1], index=[5.0, 10.3]),
         )
 
-        # 32-bit linux has a different ordering
-        if IS64:
-            result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False)
-            expected = Series([2, 1, 1], index=[5.0, np.nan, 10.3])
-            tm.assert_series_equal(result, expected)
+        result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
+        expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan])
+        tm.assert_series_equal(result, expected)
 
     def test_value_counts_normalized(self):
         # GH12558
-        s = Series([1, 2, np.nan, np.nan, np.nan])
+        s = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
         dtypes = (np.float64, object, "M8[ns]")
         for t in dtypes:
             s_typed = s.astype(t)
             result = s_typed.value_counts(normalize=True, dropna=False)
             expected = Series(
-                [0.6, 0.2, 0.2], index=Series([np.nan, 1.0, 2.0], dtype=t)
+                [0.5, 0.3, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t)
             )
             tm.assert_series_equal(result, expected)
 
             result = s_typed.value_counts(normalize=True, dropna=True)
-            expected = Series([0.5, 0.5], index=Series([1.0, 2.0], dtype=t))
+            expected = Series([0.6, 0.4], index=Series([2.0, 1.0], dtype=t))
             tm.assert_series_equal(result, expected)
 
     def test_value_counts_uint64(self):

From 641eda13223efa0b4ca630497759fb1ed59c0dc5 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 3 Nov 2020 06:06:33 +0100
Subject: [PATCH 5/8] fixing doctest: the order of 1,4,2 depends on hash and is
 not unique

---
 pandas/core/base.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index 4760b92ad5fec..b3366cca37617 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -982,9 +982,9 @@ def value_counts(
         >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
         >>> index.value_counts()
         3.0    2
-        1.0    1
         2.0    1
         4.0    1
+        1.0    1
         dtype: int64
 
         With `normalize` set to `True`, returns the relative frequency by
@@ -993,9 +993,9 @@ def value_counts(
         >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(normalize=True)
         3.0    0.4
-        1.0    0.2
         2.0    0.2
         4.0    0.2
+        1.0    0.2
         dtype: float64
 
         **bins**
@@ -1017,10 +1017,10 @@ def value_counts(
 
         >>> s.value_counts(dropna=False)
         3.0    2
-        1.0    1
         2.0    1
-        4.0    1
         NaN    1
+        4.0    1
+        1.0    1
         dtype: int64
         """
         result = value_counts(

From 309c9d66e3e5ae3516643edd3133f1fbf5d40870 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 13 Oct 2020 21:41:20 +0200
Subject: [PATCH 6/8] adding whatsnew-note

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 09cb024cbd95c..0d08a01066cfc 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -376,6 +376,7 @@ Performance improvements
 - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
+- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`)
 
 .. ---------------------------------------------------------------------------
 

From f2496381d1dd088df60f29e6a9604dce30abaf1f Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 18 Oct 2020 06:37:42 +0200
Subject: [PATCH 7/8] requested doc changes

---
 doc/source/whatsnew/v1.2.0.rst       | 2 +-
 pandas/_libs/src/klib/khash_python.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 0d08a01066cfc..54ca602ff3c86 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -376,7 +376,7 @@ Performance improvements
 - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
-- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`)
+- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`); changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. :meth:`Index.value_counts`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index e50b09bd00d74..aebc229abddd2 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -15,7 +15,7 @@
 // hash for 64 bit integers.
 // GH 13436 showed that _Py_HashDouble doesn't work well with khash
 // GH 28303 showed, that the simple xoring-version isn't good enough
-// thus murmur2-hash is used
+// See GH 36729 for evaluation of the currently used murmur2-hash version
 
 khint64_t PANDAS_INLINE asint64(double key) {
     khint64_t val;

From e743c79693aa7a62d5fef5c24cadd06590261011 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 3 Nov 2020 08:24:11 +0100
Subject: [PATCH 8/8] use Series rather than pd.Series

---
 pandas/tests/base/test_value_counts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 0c93ef242517b..e9713e38f9874 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -239,7 +239,7 @@ def test_value_counts_datetime64(index_or_series):
     tm.assert_series_equal(result, expected_s)
 
     result = s.value_counts(dropna=False)
-    expected_s = pd.concat([pd.Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
+    expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
     tm.assert_series_equal(result, expected_s)
 
     unique = s.unique()