
PERF: float hash slow in py3 #13436

Status: Closed · wants to merge 2 commits
137 changes: 36 additions & 101 deletions asv_bench/benchmarks/groupby.py
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
 #----------------------------------------------------------------------
 # groupby with a variable value for ngroups

-class groupby_ngroups_10000(object):
+class groupby_ngroups_int_10000(object):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 10000

     def setup(self):
         np.random.seed(1234)
-        self.ngroups = 10000
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+        size = self.ngroups * 2
+        rng = np.arange(self.ngroups)
+        ts = rng.take(np.random.randint(0, self.ngroups, size=size))
+        if self.dtype == 'int':
+            value = np.random.randint(0, size, size=size)
+        else:
+            value = np.concatenate([np.random.random(self.ngroups) * 0.1,
+                                    np.random.random(self.ngroups) * 10.0])
+
+        self.df = DataFrame({'timestamp': ts,
+                             'value': value})

     def time_all(self):
         self.df.groupby('value')['timestamp'].all()
@@ -482,109 +491,35 @@ def time_value_counts(self):
     def time_var(self):
         self.df.groupby('value')['timestamp'].var()

-
-class groupby_ngroups_100(object):
+class groupby_ngroups_int_100(groupby_ngroups_int_10000):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 100

-    def setup(self):
-        np.random.seed(1234)
-        self.ngroups = 100
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
-
-    def time_all(self):
-        self.df.groupby('value')['timestamp'].all()
-
-    def time_any(self):
-        self.df.groupby('value')['timestamp'].any()
-
-    def time_count(self):
-        self.df.groupby('value')['timestamp'].count()
-
-    def time_cumcount(self):
-        self.df.groupby('value')['timestamp'].cumcount()
-
-    def time_cummax(self):
-        self.df.groupby('value')['timestamp'].cummax()
-
-    def time_cummin(self):
-        self.df.groupby('value')['timestamp'].cummin()
-
-    def time_cumprod(self):
-        self.df.groupby('value')['timestamp'].cumprod()
-
-    def time_cumsum(self):
-        self.df.groupby('value')['timestamp'].cumsum()
-
-    def time_describe(self):
-        self.df.groupby('value')['timestamp'].describe()
-
-    def time_diff(self):
-        self.df.groupby('value')['timestamp'].diff()
-
-    def time_first(self):
-        self.df.groupby('value')['timestamp'].first()
-
-    def time_head(self):
-        self.df.groupby('value')['timestamp'].head()
-
-    def time_last(self):
-        self.df.groupby('value')['timestamp'].last()
-
-    def time_mad(self):
-        self.df.groupby('value')['timestamp'].mad()
-
-    def time_max(self):
-        self.df.groupby('value')['timestamp'].max()
-
-    def time_mean(self):
-        self.df.groupby('value')['timestamp'].mean()
-
-    def time_median(self):
-        self.df.groupby('value')['timestamp'].median()
-
-    def time_min(self):
-        self.df.groupby('value')['timestamp'].min()
-
-    def time_nunique(self):
-        self.df.groupby('value')['timestamp'].nunique()
-
-    def time_pct_change(self):
-        self.df.groupby('value')['timestamp'].pct_change()
-
-    def time_prod(self):
-        self.df.groupby('value')['timestamp'].prod()
-
-    def time_rank(self):
-        self.df.groupby('value')['timestamp'].rank()
-
-    def time_sem(self):
-        self.df.groupby('value')['timestamp'].sem()
-
-    def time_size(self):
-        self.df.groupby('value')['timestamp'].size()
-
-    def time_skew(self):
-        self.df.groupby('value')['timestamp'].skew()
-
-    def time_std(self):
-        self.df.groupby('value')['timestamp'].std()
+class groupby_ngroups_float_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 100

-    def time_sum(self):
-        self.df.groupby('value')['timestamp'].sum()
+class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 10000

-    def time_tail(self):
-        self.df.groupby('value')['timestamp'].tail()
-
-    def time_unique(self):
-        self.df.groupby('value')['timestamp'].unique()
+
+class groupby_float32(object):
+    # GH 13335
+    goal_time = 0.2

-    def time_value_counts(self):
-        self.df.groupby('value')['timestamp'].value_counts()
+    def setup(self):
+        tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
+        tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
+        tmp = np.concatenate((tmp1, tmp2))
+        arr = np.repeat(tmp, 100)
+        self.df = DataFrame(dict(a=arr, b=arr))

-    def time_var(self):
-        self.df.groupby('value')['timestamp'].var()
+    def time_groupby_sum(self):
+        self.df.groupby(['a'])['b'].sum()


 #----------------------------------------------------------------------
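The point of this refactor: groupby_ngroups_int_10000 now carries all 31 time_* methods once, and every other dtype/ngroups combination becomes a three-line subclass whose class attributes are read by the shared setup(). The new groupby_float32 class (GH 13335) separately exercises float32 keys. As a hypothetical illustration (this class is not part of the patch), adding one more combination would look like:

class groupby_ngroups_float_1000(groupby_ngroups_int_10000):
    goal_time = 0.2
    dtype = 'float'   # shared setup() branches on this to build float keys
    ngroups = 1000    # shared setup() sizes the data from this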
15 changes: 14 additions & 1 deletion asv_bench/benchmarks/indexing.py
@@ -486,4 +486,17 @@ def setup(self):
         self.midx = self.midx.take(np.random.permutation(np.arange(100000)))

     def time_sort_level_zero(self):
-        self.midx.sortlevel(0)
+        self.midx.sortlevel(0)
+
+class float_loc(object):
+    # GH 13166
+    goal_time = 0.2
+
+    def setup(self):
+        a = np.arange(1000000)
+        self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
+
+    def time_float_loc(self):
+        self.ind.get_loc(0)
+
+
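The float_loc benchmark is a direct distillation of GH 13166: one million distinct float64 keys whose hash-table construction was pathologically slow under Python 3. A standalone repro sketch, assuming the 0.18-era API in which Float64Index is public:

import numpy as np
import pandas as pd

# Before this patch, the first get_loc call (which builds the float64
# hash table) took far longer on Python 3 than on Python 2.
ind = pd.Float64Index(np.arange(1000000) * 4.8000000418824129e-08)
ind.get_loc(0.0)  # 0.0 is the first element of the index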
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.2.txt
@@ -307,7 +307,7 @@ Performance Improvements
 - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
 - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)

-
+- Improved performance of float64 hash table fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)

7 changes: 4 additions & 3 deletions pandas/src/klib/khash_python.h
@@ -2,9 +2,10 @@

 #include "khash.h"

-// kludge
-
-#define kh_float64_hash_func _Py_HashDouble
+inline khint64_t asint64(double key) {
+    return *(khint64_t *)(&key);
+}
+#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))

 #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

Review comment from @jreback (Contributor), Jun 14, 2016, on the asint64 definition:

    Add a comment (and a link to the Python source code) explaining why we are not using _Py_HashDouble here (and this issue number as well). And maybe an explanation of what this is doing and why it's better (in pandas).
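In short, the patch stops routing every key through CPython's _Py_HashDouble (which is tuned so that hash(1.0) == hash(1) across numeric types, not for dispersion in a 32-bit table) and instead hashes the double's raw IEEE-754 bit pattern with a cheap shift/xor mix; kh_float64_hash_equal is unchanged and keeps NaN usable as a key, since x != x is true only for NaN. A Python sketch of the same mixing, for illustration only (the struct round-trip stands in for the C pointer cast):

import struct

def asint64(key):
    # Reinterpret the 8 bytes of an IEEE-754 double as an unsigned
    # 64-bit integer, mirroring the C expression *(khint64_t *)(&key).
    return struct.unpack('<Q', struct.pack('<d', key))[0]

def kh_float64_hash_func(key):
    bits = asint64(key)
    # Same mix as the new macro, truncated to 32 bits like the
    # (khint32_t) cast: (bits >> 33) ^ bits ^ (bits << 11).
    return ((bits >> 33) ^ bits ^ (bits << 11)) & 0xFFFFFFFF

print(kh_float64_hash_func(0.1))  # a well-dispersed 32-bit hash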