Skip to content

BENCH/REF: parametrize CSV benchmarks on engine #38442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Dec 17, 2020
95 changes: 63 additions & 32 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

Expand Down Expand Up @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c", "python"])
param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
Expand All @@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
    """Time ``read_csv`` on ``self.fname`` for one skiprows/engine pair."""
    read_csv(
        self.fname,
        engine=engine,
        skiprows=skiprows,
    )


class ReadUint64Integers(StringIORewind):
Expand All @@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
class ReadCSVThousands(BaseIO):

fname = "__test__.csv"
params = ([",", "|"], [None, ","])
param_names = ["sep", "thousands"]
params = ([",", "|"], [None, ","], ["c", "python"])
param_names = ["sep", "thousands", "engine"]

def setup(self, sep, thousands):
def setup(self, sep, thousands, engine):
N = 10000
K = 8
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
Expand All @@ -206,16 +206,19 @@ def setup(self, sep, thousands):
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)

def time_thousands(self, sep, thousands):
read_csv(self.fname, sep=sep, thousands=thousands)
def time_thousands(self, sep, thousands, engine):
    """Time ``read_csv`` on ``self.fname`` for one sep/thousands/engine combo."""
    read_csv(
        self.fname,
        engine=engine,
        sep=sep,
        thousands=thousands,
    )


class ReadCSVComment(StringIORewind):
def setup(self):
params = ["c", "python"]
param_names = ["engine"]

def setup(self, engine):
data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
self.StringIO_input = StringIO("\n".join(data))

def time_comment(self):
def time_comment(self, engine):
    """Time ``read_csv`` with ``comment="#"`` for the given parser engine.

    The benchmark is parametrized on ``engine``; the value has to be passed
    through to ``read_csv``, otherwise every parametrization times the
    default engine.
    """
    read_csv(
        self.data(self.StringIO_input),
        engine=engine,
        comment="#",
        header=None,
        names=list("abc"),
    )
Expand Down Expand Up @@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
)


class ReadCSVEngine(StringIORewind):
    """Benchmarks ``read_csv`` on a text buffer and a byte buffer."""

    # Parametrized on the parser engine.
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        """Build the same CSV payload as a ``StringIO`` and a ``BytesIO``."""
        header_line = "A,B,C,D,E"
        body_line = "1,2,3,4,5"
        csv_lines = [header_line]
        csv_lines.extend([body_line] * 100000)
        self.StringIO_input = StringIO("\n".join(csv_lines))
        # simulate reading from file
        csv_text = self.StringIO_input.read()
        self.BytesIO_input = BytesIO(csv_text.encode("utf-8"))

    def time_read_stringcsv(self, engine):
        """Time parsing of the text buffer with the given engine."""
        text_buffer = self.data(self.StringIO_input)
        read_csv(text_buffer, engine=engine)

    def time_read_bytescsv(self, engine):
        """Time parsing of the byte buffer with the given engine."""
        byte_buffer = self.data(self.BytesIO_input)
        read_csv(byte_buffer, engine=engine)


class ReadCSVCategorical(BaseIO):

fname = "__test__.csv"
params = ["c", "python"]
param_names = ["engine"]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could also parametrize this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

def setup(self):
def setup(self, engine):
N = 100000
group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
df.to_csv(self.fname, index=False)

def time_convert_post(self):
read_csv(self.fname).apply(Categorical)
def time_convert_post(self, engine):
    """Time reading ``self.fname`` and then applying ``Categorical`` per column."""
    parsed = read_csv(self.fname, engine=engine)
    parsed.apply(Categorical)

def time_convert_direct(self):
read_csv(self.fname, dtype="category")
def time_convert_direct(self, engine):
    """Time reading ``self.fname`` with ``dtype="category"`` requested up front."""
    read_csv(
        self.fname,
        dtype="category",
        engine=engine,
    )


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
Expand All @@ -284,18 +309,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
    """Time ``read_csv`` combining column pairs (1, 2) and (1, 3) into dates."""
    date_column_groups = [[1, 2], [1, 3]]
    read_csv(
        self.data(self.StringIO_input),
        engine=engine,
        sep=",",
        header=None,
        names=list(string.digits[:9]),
        parse_dates=date_column_groups,
    )

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
Expand All @@ -304,17 +331,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
Expand All @@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
chunksize = 20
num_rows = 1000
fname = "__test__.csv"
params = ["c", "python"]
param_names = ["engine"]

def setup(self):
def setup(self, engine):
with open(self.fname, "w") as f:
for i in range(self.num_rows):
f.write(f"{i}\n")

def mem_parser_chunks(self):
def mem_parser_chunks(self, engine):
    """Iterate over every chunk of ``self.fname`` read with ``self.chunksize``."""
    # see gh-24805.
    chunk_reader = read_csv(
        self.fname,
        chunksize=self.chunksize,
        engine=engine,
    )

    # Each chunk is discarded; only the iteration itself is exercised.
    for _chunk in chunk_reader:
        pass


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
Expand Down