Skip to content

Commit 816dd14

Browse files
arw2019 authored and luckyvs1 committed
BENCH/REF: parametrize CSV benchmarks on engine (pandas-dev#38442)
1 parent 37c66b4 commit 816dd14

File tree

1 file changed

+63
-32
lines changed

1 file changed

+63
-32
lines changed

asv_bench/benchmarks/io/csv.py

+63-32
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from io import StringIO
1+
from io import BytesIO, StringIO
22
import random
33
import string
44

@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
146146
class ReadCSVSkipRows(BaseIO):
147147

148148
fname = "__test__.csv"
149-
params = [None, 10000]
150-
param_names = ["skiprows"]
149+
params = ([None, 10000], ["c", "python"])
150+
param_names = ["skiprows", "engine"]
151151

152-
def setup(self, skiprows):
152+
def setup(self, skiprows, engine):
153153
N = 20000
154154
index = tm.makeStringIndex(N)
155155
df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
164164
)
165165
df.to_csv(self.fname)
166166

167-
def time_skipprows(self, skiprows):
168-
read_csv(self.fname, skiprows=skiprows)
167+
def time_skipprows(self, skiprows, engine):
168+
read_csv(self.fname, skiprows=skiprows, engine=engine)
169169

170170

171171
class ReadUint64Integers(StringIORewind):
@@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
192192
class ReadCSVThousands(BaseIO):
193193

194194
fname = "__test__.csv"
195-
params = ([",", "|"], [None, ","])
196-
param_names = ["sep", "thousands"]
195+
params = ([",", "|"], [None, ","], ["c", "python"])
196+
param_names = ["sep", "thousands", "engine"]
197197

198-
def setup(self, sep, thousands):
198+
def setup(self, sep, thousands, engine):
199199
N = 10000
200200
K = 8
201201
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
@@ -206,16 +206,19 @@ def setup(self, sep, thousands):
206206
df = df.applymap(lambda x: fmt.format(x))
207207
df.to_csv(self.fname, sep=sep)
208208

209-
def time_thousands(self, sep, thousands):
210-
read_csv(self.fname, sep=sep, thousands=thousands)
209+
def time_thousands(self, sep, thousands, engine):
210+
read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)
211211

212212

213213
class ReadCSVComment(StringIORewind):
    """Benchmark ``read_csv`` on rows carrying a trailing ``#`` comment.

    Parametrized on the parser ``engine`` ("c" or "python").
    """

    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        """Build the in-memory CSV: one header line plus 100000 commented rows.

        ``engine`` is unused here; it is accepted because the benchmark is
        parametrized on it.
        """
        data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
        self.StringIO_input = StringIO("\n".join(data))

    def time_comment(self, engine):
        """Time parsing the buffer with ``comment="#"`` using ``engine``."""
        # ``engine`` must be forwarded explicitly; otherwise every parameter
        # value would time the same default parser.
        # ``self.data`` comes from the base class (not visible in this view).
        read_csv(
            self.data(self.StringIO_input),
            engine=engine,
            comment="#",
            header=None,
            names=list("abc"),
        )
@@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
255258
)
256259

257260

261+
class ReadCSVEngine(StringIORewind):
    """Benchmark ``read_csv`` on text and bytes buffers for each parser engine."""

    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        """Create a text buffer and a UTF-8 bytes buffer holding the same CSV.

        The CSV is one header line followed by 100000 identical data rows.
        """
        header = ["A,B,C,D,E"]
        body = ["1,2,3,4,5"] * 100000
        csv_text = "\n".join(header + body)
        self.StringIO_input = StringIO(csv_text)
        # simulate reading from file
        encoded = self.StringIO_input.read().encode("utf-8")
        self.BytesIO_input = BytesIO(encoded)

    def time_read_stringcsv(self, engine):
        """Time ``read_csv`` on the text buffer."""
        text_buffer = self.data(self.StringIO_input)
        read_csv(text_buffer, engine=engine)

    def time_read_bytescsv(self, engine):
        """Time ``read_csv`` on the bytes buffer."""
        bytes_buffer = self.data(self.BytesIO_input)
        read_csv(bytes_buffer, engine=engine)
276+
277+
258278
class ReadCSVCategorical(BaseIO):
    """Benchmark two ways of getting categorical columns out of ``read_csv``.

    Parametrized on the parser ``engine`` ("c" or "python").
    """

    fname = "__test__.csv"
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        """Write a 100000 x 3 CSV of strings drawn at random from five labels."""
        n_rows = 100000
        labels = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
        values = np.random.choice(labels, (n_rows, 3))
        frame = DataFrame(values, columns=list("abc"))
        frame.to_csv(self.fname, index=False)

    def time_convert_post(self, engine):
        """Time reading the file, then applying ``Categorical`` via ``apply``."""
        parsed = read_csv(self.fname, engine=engine)
        parsed.apply(Categorical)

    def time_convert_direct(self, engine):
        """Time reading the file with ``dtype="category"`` requested up front."""
        read_csv(self.fname, engine=engine, dtype="category")
273295

274296

275297
class ReadCSVParseDates(StringIORewind):
276-
def setup(self):
298+
params = ["c", "python"]
299+
param_names = ["engine"]
300+
301+
def setup(self, engine):
277302
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
278303
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
279304
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +309,20 @@ def setup(self):
284309
data = data.format(*two_cols)
285310
self.StringIO_input = StringIO(data)
286311

287-
def time_multiple_date(self):
312+
def time_multiple_date(self, engine):
    """Time ``read_csv`` with two nested ``parse_dates`` column groups."""
    column_names = list(string.digits[:9])
    date_column_groups = [[1, 2], [1, 3]]
    read_csv(
        self.data(self.StringIO_input),
        engine=engine,
        sep=",",
        header=None,
        names=column_names,
        parse_dates=date_column_groups,
    )
295321

296-
def time_baseline(self):
322+
def time_baseline(self, engine):
297323
read_csv(
298324
self.data(self.StringIO_input),
325+
engine=engine,
299326
sep=",",
300327
header=None,
301328
parse_dates=[1],
@@ -304,17 +331,18 @@ def time_baseline(self):
304331

305332

306333
class ReadCSVCachedParseDates(StringIORewind):
307-
params = ([True, False],)
308-
param_names = ["do_cache"]
334+
params = ([True, False], ["c", "python"])
335+
param_names = ["do_cache", "engine"]
309336

310-
def setup(self, do_cache):
337+
def setup(self, do_cache, engine):
311338
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
312339
self.StringIO_input = StringIO(data)
313340

314-
def time_read_csv_cached(self, do_cache):
341+
def time_read_csv_cached(self, do_cache, engine):
315342
try:
316343
read_csv(
317344
self.data(self.StringIO_input),
345+
engine=engine,
318346
header=None,
319347
parse_dates=[0],
320348
cache_dates=do_cache,
@@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
329357
chunksize = 20
330358
num_rows = 1000
331359
fname = "__test__.csv"
360+
params = ["c", "python"]
361+
param_names = ["engine"]
332362

333-
def setup(self):
363+
def setup(self, engine):
334364
with open(self.fname, "w") as f:
335365
for i in range(self.num_rows):
336366
f.write(f"{i}\n")
337367

338-
def mem_parser_chunks(self):
368+
def mem_parser_chunks(self, engine):
    """Read ``self.fname`` in chunks of ``self.chunksize`` rows and exhaust the reader."""
    # see gh-24805.
    reader = read_csv(self.fname, engine=engine, chunksize=self.chunksize)

    for _chunk in reader:
        continue
344374

345375

346376
class ReadCSVParseSpecialDate(StringIORewind):
347-
params = (["mY", "mdY", "hm"],)
348-
param_names = ["value"]
377+
params = (["mY", "mdY", "hm"], ["c", "python"])
378+
param_names = ["value", "engine"]
349379
objects = {
350380
"mY": "01-2019\n10-2019\n02/2000\n",
351381
"mdY": "12/02/2010\n",
352382
"hm": "21:34\n",
353383
}
354384

355-
def setup(self, value):
385+
def setup(self, value, engine):
356386
count_elem = 10000
357387
data = self.objects[value] * count_elem
358388
self.StringIO_input = StringIO(data)
359389

360-
def time_read_special_date(self, value):
390+
def time_read_special_date(self, value, engine):
361391
read_csv(
362392
self.data(self.StringIO_input),
393+
engine=engine,
363394
sep=",",
364395
header=None,
365396
names=["Date"],

0 commit comments

Comments
 (0)