From d87cb64921923ee9e6422491d0c32a451822024b Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 13 Dec 2020 01:47:46 -0500 Subject: [PATCH 1/5] REF: parametrize csv benchmarks on engine --- asv_bench/benchmarks/io/csv.py | 59 +++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9bcd125f56bbb..c166c1510d612 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO, StringIO import random import string @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = [None, 10000] - param_names = ["skiprows"] + params = ([None, 10000], ["c"]) + param_names = ["skiprows", "engine"] - def setup(self, skiprows): + def setup(self, skiprows, engine): N = 20000 index = tm.makeStringIndex(N) df = DataFrame( @@ -164,8 +164,8 @@ def setup(self, skiprows): ) df.to_csv(self.fname) - def time_skipprows(self, skiprows): - read_csv(self.fname, skiprows=skiprows) + def time_skipprows(self, skiprows, engine): + read_csv(self.fname, skiprows=skiprows, engine=engine) class ReadUint64Integers(StringIORewind): @@ -255,8 +255,24 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision): ) -class ReadCSVCategorical(BaseIO): +class ReadCSVEngine(StringIORewind): + params = ["c", "python"] + param_names = ["engine"] + + def setup(self, engine): + data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) + # simulate reading from file + self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8")) + + def time_read_stringcsv(self, engine): + read_csv(self.data(self.StringIO_input), engine=engine) + + def time_read_bytescsv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + +class ReadCSVCategorical(BaseIO): fname = "__test__.csv" def setup(self): @@ -273,7 +289,10 @@ def time_convert_direct(self): class ReadCSVParseDates(StringIORewind): - def setup(self): + params = ["c", "python"] + param_names = ["engine"] + + def setup(self, engine): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n @@ -284,18 +303,20 @@ def setup(self): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self): + def time_multiple_date(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]], ) - def time_baseline(self): + def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, parse_dates=[1], @@ -304,17 +325,18 @@ def time_baseline(self): class ReadCSVCachedParseDates(StringIORewind): - params = ([True, False],) - param_names = ["do_cache"] + params = ([True, False], ["c", "python"]) + param_names = ["do_cache", "engine"] - def setup(self, do_cache): + def setup(self, do_cache, engine): data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) - def time_read_csv_cached(self, do_cache): + def time_read_csv_cached(self, do_cache, engine): try: read_csv( self.data(self.StringIO_input), + engine=engine, header=None, parse_dates=[0], cache_dates=do_cache, @@ -344,22 +366,23 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (["mY", "mdY", "hm"],) - param_names = ["value"] + params = (["mY", "mdY", "hm"], ["c", "python"]) + param_names = ["value", "engine"] objects = { "mY": "01-2019\n10-2019\n02/2000\n", "mdY": "12/02/2010\n", "hm": "21:34\n", } - def setup(self, value): + def setup(self, value, engine): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, engine): read_csv( self.data(self.StringIO_input), + engine=engine, sep=",", header=None, names=["Date"], From 071210d48c92435534688dff3b45f61417c7a529 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 14 Dec 2020 20:28:19 -0500 Subject: [PATCH 2/5] review comment --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c166c1510d612..b1f982f80e35a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -146,7 +146,7 @@ def time_read_csv(self, bad_date_value): class ReadCSVSkipRows(BaseIO): fname = "__test__.csv" - params = ([None, 10000], ["c"]) + params = ([None, 10000], ["c", "python"]) param_names = ["skiprows", "engine"] def setup(self, skiprows, engine): From ec2fc6ed29daadcb564af952bf58b9f16459b4a7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 15 Dec 2020 11:46:26 -0500 Subject: [PATCH 3/5] review comment --- asv_bench/benchmarks/io/csv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b1f982f80e35a..7e40f3da7280f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -273,7 +273,10 @@ def time_read_bytescsv(self, engine): class ReadCSVCategorical(BaseIO): + fname = "__test__.csv" + params = ["c", "python"] + param_names = ["engine"] def setup(self): N = 100000 From 05bb7a4035eb20835c018128053f87269887d750 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 15 Dec 2020 12:47:56 -0500 Subject: [PATCH 4/5] review comment --- asv_bench/benchmarks/io/csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 7e40f3da7280f..f684a7a7fc6d9 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -278,16 +278,16 @@ class ReadCSVCategorical(BaseIO): params = ["c", "python"] param_names = ["engine"] - def setup(self): + def setup(self, engine): N = 100000 group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"] df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc")) df.to_csv(self.fname, index=False) - def time_convert_post(self): + def time_convert_post(self, engine): read_csv(self.fname).apply(Categorical) - def time_convert_direct(self): + def time_convert_direct(self, engine): read_csv(self.fname, dtype="category") From 761bd740b4fec0040d13102c3b6c2d47c4cc162e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 15 Dec 2020 13:15:37 -0500 Subject: [PATCH 5/5] parametrize asvs --- asv_bench/benchmarks/io/csv.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index f684a7a7fc6d9..24d21ad6a633d 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -192,10 +192,10 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): fname = "__test__.csv" - params = ([",", "|"], [None, ","]) - param_names = ["sep", "thousands"] + params = ([",", "|"], [None, ","], ["c", "python"]) + param_names = ["sep", "thousands", "engine"] - def setup(self, sep, thousands): + def setup(self, sep, thousands, engine): N = 10000 K = 8 data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) @@ -206,16 +206,19 @@ def setup(self, sep, thousands): df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) - def time_thousands(self, sep, thousands): - read_csv(self.fname, sep=sep, thousands=thousands) + def time_thousands(self, sep, thousands, engine): + read_csv(self.fname, sep=sep, thousands=thousands, engine=engine) class ReadCSVComment(StringIORewind): - def setup(self): + params = ["c", "python"] + param_names = ["engine"] + + def setup(self, engine): data = ["A,B,C"] + (["1,2,3 # comment"] * 100000) self.StringIO_input = StringIO("\n".join(data)) - def time_comment(self): + def time_comment(self, engine): read_csv( self.data(self.StringIO_input), comment="#", header=None, names=list("abc") ) @@ -285,10 +288,10 @@ def setup(self, engine): df.to_csv(self.fname, index=False) def time_convert_post(self, engine): - read_csv(self.fname).apply(Categorical) + read_csv(self.fname, engine=engine).apply(Categorical) def time_convert_direct(self, engine): - read_csv(self.fname, dtype="category") + read_csv(self.fname, engine=engine, dtype="category") class ReadCSVParseDates(StringIORewind): @@ -354,15 +357,17 @@ class ReadCSVMemoryGrowth(BaseIO): chunksize = 20 num_rows = 1000 fname = "__test__.csv" + params = ["c", "python"] + param_names = ["engine"] - def setup(self): + def setup(self, engine): with open(self.fname, "w") as f: for i in range(self.num_rows): f.write(f"{i}\n") - def mem_parser_chunks(self): + def mem_parser_chunks(self, engine): # see gh-24805. - result = read_csv(self.fname, chunksize=self.chunksize) + result = read_csv(self.fname, chunksize=self.chunksize, engine=engine) for _ in result: pass