Skip to content

Commit 2915223

Browse files
f6vmroeschke
authored andcommitted
Improved benchmark coverage for reading spreadsheets (#28230)
* Improved benchmark coverage for reading spreadsheets * Added blank lines * More blank lines * Updated whatsnew * - Removed whatsnew entry - Added comment in environment.yml - Added conda-forge to asv config - Refactored reader benchmark * Updated requirements-dev.txt * Fixed imports order * Fixed imports again * Run black * Changed conda channels order in ASV config * Used setup_cache to speed up read benchmark
1 parent 04e67c4 commit 2915223

File tree

4 files changed

+58
-23
lines changed

4 files changed

+58
-23
lines changed

asv_bench/asv.conf.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,13 @@
5050
"xlsxwriter": [],
5151
"xlrd": [],
5252
"xlwt": [],
53+
"odfpy": [],
5354
"pytest": [],
5455
// If using Windows with python 2.7 and want to build using the
5556
// mingw toolchain (rather than MSVC), uncomment the following line.
5657
// "libpython": [],
5758
},
58-
59+
"conda_channels": ["defaults", "conda-forge"],
5960
// Combinations of libraries/python versions can be excluded/included
6061
// from the set to test. Each entry is a dictionary containing additional
6162
// key-value pairs to include/exclude.

asv_bench/benchmarks/io/excel.py

+54-22
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,72 @@
11
from io import BytesIO
22

33
import numpy as np
4+
from odf.opendocument import OpenDocumentSpreadsheet
5+
from odf.table import Table, TableCell, TableRow
6+
from odf.text import P
47

58
from pandas import DataFrame, ExcelWriter, date_range, read_excel
69
import pandas.util.testing as tm
710

811

9-
class Excel:
12+
def _generate_dataframe():
13+
N = 2000
14+
C = 5
15+
df = DataFrame(
16+
np.random.randn(N, C),
17+
columns=["float{}".format(i) for i in range(C)],
18+
index=date_range("20000101", periods=N, freq="H"),
19+
)
20+
df["object"] = tm.makeStringIndex(N)
21+
return df
22+
23+
24+
class WriteExcel:
1025

1126
params = ["openpyxl", "xlsxwriter", "xlwt"]
1227
param_names = ["engine"]
1328

1429
def setup(self, engine):
15-
N = 2000
16-
C = 5
17-
self.df = DataFrame(
18-
np.random.randn(N, C),
19-
columns=["float{}".format(i) for i in range(C)],
20-
index=date_range("20000101", periods=N, freq="H"),
21-
)
22-
self.df["object"] = tm.makeStringIndex(N)
23-
self.bio_read = BytesIO()
24-
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
25-
self.df.to_excel(self.writer_read, sheet_name="Sheet1")
26-
self.writer_read.save()
27-
self.bio_read.seek(0)
28-
29-
def time_read_excel(self, engine):
30-
read_excel(self.bio_read)
30+
self.df = _generate_dataframe()
3131

3232
def time_write_excel(self, engine):
33-
bio_write = BytesIO()
34-
bio_write.seek(0)
35-
writer_write = ExcelWriter(bio_write, engine=engine)
36-
self.df.to_excel(writer_write, sheet_name="Sheet1")
37-
writer_write.save()
33+
bio = BytesIO()
34+
bio.seek(0)
35+
writer = ExcelWriter(bio, engine=engine)
36+
self.df.to_excel(writer, sheet_name="Sheet1")
37+
writer.save()
38+
39+
40+
class ReadExcel:
41+
42+
params = ["xlrd", "openpyxl", "odf"]
43+
param_names = ["engine"]
44+
fname_excel = "spreadsheet.xlsx"
45+
fname_odf = "spreadsheet.ods"
46+
47+
def _create_odf(self):
48+
doc = OpenDocumentSpreadsheet()
49+
table = Table(name="Table1")
50+
for row in self.df.values:
51+
tr = TableRow()
52+
for val in row:
53+
tc = TableCell(valuetype="string")
54+
tc.addElement(P(text=val))
55+
tr.addElement(tc)
56+
table.addElement(tr)
57+
58+
doc.spreadsheet.addElement(table)
59+
doc.save(self.fname_odf)
60+
61+
def setup_cache(self):
62+
self.df = _generate_dataframe()
63+
64+
self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
65+
self._create_odf()
66+
67+
def time_read_excel(self, engine):
68+
fname = self.fname_odf if engine == "odf" else self.fname_excel
69+
read_excel(fname, engine=engine)
3870

3971

4072
from ..pandas_vb_common import setup # noqa: F401 isort:skip

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,5 @@ dependencies:
8080
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8181
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8282
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
83+
- odfpy # pandas.read_excel
8384
- pyreadstat # pandas.read_spss

requirements-dev.txt

+1
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,5 @@ xarray
5454
xlrd
5555
xlsxwriter
5656
xlwt
57+
odfpy
5758
pyreadstat

0 commit comments

Comments
 (0)