Skip to content

Improved benchmark coverage for reading spreadsheets #28230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 5, 2019
3 changes: 2 additions & 1 deletion asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,13 @@
"xlsxwriter": [],
"xlrd": [],
"xlwt": [],
"odfpy": [],
"pytest": [],
// If using Windows with python 2.7 and want to build using the
// mingw toolchain (rather than MSVC), uncomment the following line.
// "libpython": [],
},

"conda_channels": ["defaults", "conda-forge"],
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
Expand Down
76 changes: 54 additions & 22 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,72 @@
from io import BytesIO

import numpy as np
from odf.opendocument import OpenDocumentSpreadsheet
from odf.table import Table, TableCell, TableRow
from odf.text import P

from pandas import DataFrame, ExcelWriter, date_range, read_excel
import pandas.util.testing as tm


class Excel:
def _generate_dataframe():
N = 2000
C = 5
df = DataFrame(
np.random.randn(N, C),
columns=["float{}".format(i) for i in range(C)],
index=date_range("20000101", periods=N, freq="H"),
)
df["object"] = tm.makeStringIndex(N)
return df


class WriteExcel:

params = ["openpyxl", "xlsxwriter", "xlwt"]
param_names = ["engine"]

def setup(self, engine):
N = 2000
C = 5
self.df = DataFrame(
np.random.randn(N, C),
columns=["float{}".format(i) for i in range(C)],
index=date_range("20000101", periods=N, freq="H"),
)
self.df["object"] = tm.makeStringIndex(N)
self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)

def time_read_excel(self, engine):
read_excel(self.bio_read)
self.df = _generate_dataframe()

def time_write_excel(self, engine):
bio_write = BytesIO()
bio_write.seek(0)
writer_write = ExcelWriter(bio_write, engine=engine)
self.df.to_excel(writer_write, sheet_name="Sheet1")
writer_write.save()
bio = BytesIO()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason for changing this? I think self-contained

Copy link
Contributor Author

@f6v f6v Aug 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The naming was obsolete. Before this PR, a single setup function served both the read and write benchmarks, so the `_read` and `_write` suffixes on the variable names were justified. Now that the two benchmarks live in separate classes, I don't think the suffixes are needed anymore.

bio.seek(0)
writer = ExcelWriter(bio, engine=engine)
self.df.to_excel(writer, sheet_name="Sheet1")
writer.save()


class ReadExcel:
    """Benchmark ``read_excel`` once per reader engine.

    ``setup_cache`` writes two fixture files into the working directory:
    an ``.xlsx`` workbook and an ``.ods`` spreadsheet. ``time_read_excel``
    then reads whichever file matches the engine under test.
    """

    params = ["xlrd", "openpyxl", "odf"]
    param_names = ["engine"]
    fname_excel = "spreadsheet.xlsx"
    fname_odf = "spreadsheet.ods"

    def _create_odf(self):
        """Save the values of ``self.df`` to ``fname_odf`` as string cells."""
        document = OpenDocumentSpreadsheet()
        sheet = Table(name="Table1")
        for record in self.df.values:
            sheet_row = TableRow()
            for value in record:
                # Every value is stored as a string-typed cell wrapping a
                # single text paragraph.
                cell = TableCell(valuetype="string")
                cell.addElement(P(text=value))
                sheet_row.addElement(cell)
            sheet.addElement(sheet_row)

        document.spreadsheet.addElement(sheet)
        document.save(self.fname_odf)

    def setup_cache(self):
        """Generate the benchmark frame and write both fixture files."""
        self.df = _generate_dataframe()

        self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
        self._create_odf()

    def time_read_excel(self, engine):
        """Time ``read_excel`` on the fixture file matching ``engine``."""
        if engine == "odf":
            path = self.fname_odf
        else:
            path = self.fname_excel
        read_excel(path, engine=engine)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,5 @@ dependencies:
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- odfpy # pandas.read_excel
- pyreadstat # pandas.read_spss
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ xarray
xlrd
xlsxwriter
xlwt
odfpy
pyreadstat