Skip to content

Commit f98b4b5

Browse files
chris-b1 authored and jreback committed
PERF: float hash slow in py3
closes #13166
closes #13335

Author: Chris <[email protected]>

Closes #13436 from chris-b1/float-hash and squashes the following commits:

3aec078 [Chris] smaller benches, explanatory comment
339ad1a [Chris] PERF: float hash slow in py3
1 parent eefe71e commit f98b4b5

File tree

4 files changed

+66
-106
lines changed

4 files changed

+66
-106
lines changed

asv_bench/benchmarks/groupby.py

+36-101
Original file line numberDiff line numberDiff line change
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
379379
#----------------------------------------------------------------------
380380
# groupby with a variable value for ngroups
381381

382-
class groupby_ngroups_10000(object):
382+
class groupby_ngroups_int_10000(object):
383383
goal_time = 0.2
384+
dtype = 'int'
385+
ngroups = 10000
384386

385387
def setup(self):
386388
np.random.seed(1234)
387-
self.ngroups = 10000
388-
self.size = (self.ngroups * 2)
389-
self.rng = np.arange(self.ngroups)
390-
self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
389+
size = self.ngroups * 2
390+
rng = np.arange(self.ngroups)
391+
ts = rng.take(np.random.randint(0, self.ngroups, size=size))
392+
if self.dtype == 'int':
393+
value = np.random.randint(0, size, size=size)
394+
else:
395+
value = np.concatenate([np.random.random(self.ngroups) * 0.1,
396+
np.random.random(self.ngroups) * 10.0])
397+
398+
self.df = DataFrame({'timestamp': ts,
399+
'value': value})
391400

392401
def time_all(self):
393402
self.df.groupby('value')['timestamp'].all()
@@ -482,109 +491,35 @@ def time_value_counts(self):
482491
def time_var(self):
483492
self.df.groupby('value')['timestamp'].var()
484493

485-
486-
class groupby_ngroups_100(object):
494+
class groupby_ngroups_int_100(groupby_ngroups_int_10000):
487495
goal_time = 0.2
496+
dtype = 'int'
497+
ngroups = 100
488498

489-
def setup(self):
490-
np.random.seed(1234)
491-
self.ngroups = 100
492-
self.size = (self.ngroups * 2)
493-
self.rng = np.arange(self.ngroups)
494-
self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
495-
496-
def time_all(self):
497-
self.df.groupby('value')['timestamp'].all()
498-
499-
def time_any(self):
500-
self.df.groupby('value')['timestamp'].any()
501-
502-
def time_count(self):
503-
self.df.groupby('value')['timestamp'].count()
504-
505-
def time_cumcount(self):
506-
self.df.groupby('value')['timestamp'].cumcount()
507-
508-
def time_cummax(self):
509-
self.df.groupby('value')['timestamp'].cummax()
510-
511-
def time_cummin(self):
512-
self.df.groupby('value')['timestamp'].cummin()
513-
514-
def time_cumprod(self):
515-
self.df.groupby('value')['timestamp'].cumprod()
516-
517-
def time_cumsum(self):
518-
self.df.groupby('value')['timestamp'].cumsum()
519-
520-
def time_describe(self):
521-
self.df.groupby('value')['timestamp'].describe()
522-
523-
def time_diff(self):
524-
self.df.groupby('value')['timestamp'].diff()
525-
526-
def time_first(self):
527-
self.df.groupby('value')['timestamp'].first()
528-
529-
def time_head(self):
530-
self.df.groupby('value')['timestamp'].head()
531-
532-
def time_last(self):
533-
self.df.groupby('value')['timestamp'].last()
534-
535-
def time_mad(self):
536-
self.df.groupby('value')['timestamp'].mad()
537-
538-
def time_max(self):
539-
self.df.groupby('value')['timestamp'].max()
540-
541-
def time_mean(self):
542-
self.df.groupby('value')['timestamp'].mean()
543-
544-
def time_median(self):
545-
self.df.groupby('value')['timestamp'].median()
546-
547-
def time_min(self):
548-
self.df.groupby('value')['timestamp'].min()
549-
550-
def time_nunique(self):
551-
self.df.groupby('value')['timestamp'].nunique()
552-
553-
def time_pct_change(self):
554-
self.df.groupby('value')['timestamp'].pct_change()
555-
556-
def time_prod(self):
557-
self.df.groupby('value')['timestamp'].prod()
558-
559-
def time_rank(self):
560-
self.df.groupby('value')['timestamp'].rank()
561-
562-
def time_sem(self):
563-
self.df.groupby('value')['timestamp'].sem()
564-
565-
def time_size(self):
566-
self.df.groupby('value')['timestamp'].size()
567-
568-
def time_skew(self):
569-
self.df.groupby('value')['timestamp'].skew()
570-
571-
def time_std(self):
572-
self.df.groupby('value')['timestamp'].std()
499+
class groupby_ngroups_float_100(groupby_ngroups_int_10000):
500+
goal_time = 0.2
501+
dtype = 'float'
502+
ngroups = 100
573503

574-
def time_sum(self):
575-
self.df.groupby('value')['timestamp'].sum()
504+
class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
505+
goal_time = 0.2
506+
dtype = 'float'
507+
ngroups = 10000
576508

577-
def time_tail(self):
578-
self.df.groupby('value')['timestamp'].tail()
579509

580-
def time_unique(self):
581-
self.df.groupby('value')['timestamp'].unique()
510+
class groupby_float32(object):
511+
# GH 13335
512+
goal_time = 0.2
582513

583-
def time_value_counts(self):
584-
self.df.groupby('value')['timestamp'].value_counts()
514+
def setup(self):
515+
tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
516+
tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
517+
tmp = np.concatenate((tmp1, tmp2))
518+
arr = np.repeat(tmp, 10)
519+
self.df = DataFrame(dict(a=arr, b=arr))
585520

586-
def time_var(self):
587-
self.df.groupby('value')['timestamp'].var()
521+
def time_groupby_sum(self):
522+
self.df.groupby(['a'])['b'].sum()
588523

589524

590525
#----------------------------------------------------------------------

asv_bench/benchmarks/indexing.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -486,4 +486,17 @@ def setup(self):
486486
self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
487487

488488
def time_sort_level_zero(self):
489-
self.midx.sortlevel(0)
489+
self.midx.sortlevel(0)
490+
491+
class float_loc(object):
492+
# GH 13166
493+
goal_time = 0.2
494+
495+
def setup(self):
496+
a = np.arange(100000)
497+
self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
498+
499+
def time_float_loc(self):
500+
self.ind.get_loc(0)
501+
502+

doc/source/whatsnew/v0.18.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ Performance Improvements
345345
- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
346346
- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
347347

348-
348+
- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13335`)
349349
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
350350

351351

pandas/src/klib/khash_python.h

+15-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,21 @@
22

33
#include "khash.h"
44

5-
// kludge
6-
7-
#define kh_float64_hash_func _Py_HashDouble
5+
// Previously we were using the built in cpython hash function for doubles
6+
// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
7+
// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
8+
9+
// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
10+
// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
11+
// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
12+
// is 64 bits the truncation causes collision issues. Given all that, we use our own
13+
// simple hash, viewing the double bytes as an int64 and using khash's default
14+
// hash for 64 bit integers.
15+
// GH 13436
16+
inline khint64_t asint64(double key) {
17+
return *(khint64_t *)(&key);
18+
}
19+
#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
820
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
921

1022
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

0 commit comments

Comments
 (0)