Skip to content

PERF: Benchmark merge with non-int64 and tolerance (#28922) #28974

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 22, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 46 additions & 13 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,10 +273,10 @@ def time_merge_ordered(self):


class MergeAsof:
params = [["backward", "forward", "nearest"]]
param_names = ["direction"]
params = [["backward", "forward", "nearest"], [None, 5]]
param_names = ["direction", "tolerance"]

def setup(self, direction):
def setup(self, direction, tolerance):
one_count = 200000
two_count = 1000000

Expand All @@ -303,6 +303,9 @@ def setup(self, direction):
df1["time32"] = np.int32(df1.time)
df2["time32"] = np.int32(df2.time)

df1["timeu64"] = np.uint64(df1.time)
df2["timeu64"] = np.uint64(df2.time)

self.df1a = df1[["time", "value1"]]
self.df2a = df2[["time", "value2"]]
self.df1b = df1[["time", "key", "value1"]]
Expand All @@ -313,22 +316,52 @@ def setup(self, direction):
self.df2d = df2[["time32", "value2"]]
self.df1e = df1[["time", "key", "key2", "value1"]]
self.df2e = df2[["time", "key", "key2", "value2"]]
self.df1f = df1[["timeu64", "value1"]]
self.df2f = df2[["timeu64", "value2"]]

def time_on_int(self, direction, tolerance):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of separate functions like this, can you also just parametrize on dtype?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want all of the functions to implement the dtype, or only do this for the time_on functions?

For the other functions, one option would be to ignore the dtype by using the conventional `_` placeholder in the argument list, but I don't think that would be the cleanest approach?

Suggested change
def time_on_int(self, direction, tolerance):
def time_on(self, dtype, direction, tolerance):
self.df1a["time"] = self.df1a.time.astype(dtype)
self.df2a["time"] = self.df2a.time.astype(dtype)
...
def time_by_object(self, _, direction, tolerance):
...
)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, OK, I see your point. I think it's OK to leave it as is, then.

merge_asof(
self.df1a, self.df2a, on="time", direction=direction, tolerance=tolerance
)

def time_on_int(self, direction):
merge_asof(self.df1a, self.df2a, on="time", direction=direction)
def time_on_int32(self, direction, tolerance):
merge_asof(
self.df1d, self.df2d, on="time32", direction=direction, tolerance=tolerance
)

def time_on_int32(self, direction):
merge_asof(self.df1d, self.df2d, on="time32", direction=direction)
def time_on_uint64(self, direction, tolerance):
merge_asof(
self.df1f, self.df2f, on="timeu64", direction=direction, tolerance=tolerance
)

def time_by_object(self, direction):
merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction)
def time_by_object(self, direction, tolerance):
merge_asof(
self.df1b,
self.df2b,
on="time",
by="key",
direction=direction,
tolerance=tolerance,
)

def time_by_int(self, direction):
merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction)
def time_by_int(self, direction, tolerance):
merge_asof(
self.df1c,
self.df2c,
on="time",
by="key2",
direction=direction,
tolerance=tolerance,
)

def time_multiby(self, direction):
def time_multiby(self, direction, tolerance):
merge_asof(
self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction
self.df1e,
self.df2e,
on="time",
by=["key", "key2"],
direction=direction,
tolerance=tolerance,
)


Expand Down