Skip to content

BENCH/REF: parametrize CSV benchmarks on engine #38442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Dec 17, 2020
59 changes: 41 additions & 18 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

Expand Down Expand Up @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this supposed to be `c` and `python`, or just `c`?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On current master we only have the default (I think that's `c`).

We have `c` and `python` in all the other benchmarks, so it seems reasonable to add that here. If there's a reason we don't want that, I'll revert.

param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
Expand All @@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
read_csv(self.fname, skiprows=skiprows, engine=engine)


class ReadUint64Integers(StringIORewind):
Expand Down Expand Up @@ -255,8 +255,24 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
)


class ReadCSVCategorical(BaseIO):
class ReadCSVEngine(StringIORewind):
    """Time ``read_csv`` on in-memory text and bytes buffers, once per engine."""

    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        """Build the same CSV content as a ``StringIO`` and a ``BytesIO``.

        The content is one header row followed by 100000 identical data rows.
        """
        rows = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
        text = "\n".join(rows)
        self.StringIO_input = StringIO(text)
        # simulate reading from file
        # Encode the text directly rather than draining StringIO_input with
        # read(), so the text buffer is left positioned at its start.
        self.BytesIO_input = BytesIO(text.encode("utf-8"))

    def time_read_stringcsv(self, engine):
        """Parse the text buffer with the given engine."""
        # NOTE(review): self.data is not defined in this class; presumably it
        # is inherited from StringIORewind and rewinds the buffer -- confirm.
        read_csv(self.data(self.StringIO_input), engine=engine)

    def time_read_bytescsv(self, engine):
        """Parse the UTF-8 encoded bytes buffer with the given engine."""
        read_csv(self.data(self.BytesIO_input), engine=engine)


class ReadCSVCategorical(BaseIO):
fname = "__test__.csv"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also parametrize this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

def setup(self):
Expand All @@ -273,7 +289,10 @@ def time_convert_direct(self):


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
Expand All @@ -284,18 +303,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=list(string.digits[:9]),
parse_dates=[[1, 2], [1, 3]],
)

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
Expand All @@ -304,17 +325,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
Expand Down Expand Up @@ -344,22 +366,23 @@ def mem_parser_chunks(self):


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
Expand Down