Skip to content

Improved benchmark coverage for reading spreadsheets #28230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 5, 2019
1 change: 1 addition & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"xlsxwriter": [],
"xlrd": [],
"xlwt": [],
"pip+odfpy": [],
"pytest": [],
// If using Windows with python 2.7 and want to build using the
// mingw toolchain (rather than MSVC), uncomment the following line.
Expand Down
73 changes: 56 additions & 17 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,30 @@
import numpy as np
from pandas import DataFrame, date_range, ExcelWriter, read_excel
import pandas.util.testing as tm
from odf.opendocument import OpenDocumentSpreadsheet
from odf.text import P
from odf.table import Table, TableRow, TableCell


class Excel:
def _generate_dataframe():
    """Build the DataFrame shared by the Excel read and write benchmarks.

    The frame has 2000 rows: five columns of random floats named
    ``float0`` .. ``float4`` on an hourly index starting at 2000-01-01,
    plus an ``object`` column filled from ``tm.makeStringIndex``.
    """
    n_rows = 2000
    n_float_cols = 5
    # Draw the random block first so the random state is consumed in the
    # same order as before (floats, then the string column).
    values = np.random.randn(n_rows, n_float_cols)
    float_names = ["float{}".format(i) for i in range(n_float_cols)]
    hourly_index = date_range("20000101", periods=n_rows, freq="H")
    frame = DataFrame(values, columns=float_names, index=hourly_index)
    frame["object"] = tm.makeStringIndex(n_rows)
    return frame


class WriteExcel:

params = ["openpyxl", "xlsxwriter", "xlwt"]
param_names = ["engine"]

def setup(self, engine):
N = 2000
C = 5
self.df = DataFrame(
np.random.randn(N, C),
columns=["float{}".format(i) for i in range(C)],
index=date_range("20000101", periods=N, freq="H"),
)
self.df["object"] = tm.makeStringIndex(N)
self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)

def time_read_excel(self, engine):
read_excel(self.bio_read)
self.df = _generate_dataframe()

def time_write_excel(self, engine):
bio_write = BytesIO()
Expand All @@ -35,4 +35,43 @@ def time_write_excel(self, engine):
writer_write.save()


class ReadExcel:

params = ["xlrd", "openpyxl", "odf"]
param_names = ["engine"]

def _generate_odf(self):
    """Return an OpenDocument spreadsheet holding the values of ``self.df``.

    Only ``self.df.values`` is iterated, so index and column labels are
    not written. Each value becomes a text paragraph inside a cell
    declared with value type ``string``; all rows go into one table
    named ``Table1``.
    """
    document = OpenDocumentSpreadsheet()
    sheet = Table(name="Table1")
    for record in self.df.values:
        odf_row = TableRow()
        for item in record:
            cell = TableCell(valuetype="string")
            cell.addElement(P(text=item))
            odf_row.addElement(cell)
        sheet.addElement(odf_row)

    document.spreadsheet.addElement(sheet)
    return document

def setup(self, engine):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how much time the setup process here takes? I wonder if this should be setup_cache instead, to keep any write times out of the read benchmark.

Copy link
Contributor Author

@f6v f6v Sep 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was north of 1 second, but not more than 1.2, I think. I've replaced setup with setup_cache, which makes much more sense, of course.

self.df = _generate_dataframe()

self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read)
self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)

self.bio_read_odf = BytesIO()
odf_doc = self._generate_odf()
odf_doc.write(self.bio_read_odf)
self.bio_read_odf.seek(0)

def time_read_excel(self, engine):
    """Time ``read_excel`` for ``engine`` on the buffer prepared in setup.

    The ``odf`` engine reads the OpenDocument buffer; every other engine
    reads the buffer written through ``ExcelWriter``.
    """
    if engine == "odf":
        source = self.bio_read_odf
    else:
        source = self.bio_read
    read_excel(source, engine=engine)


from ..pandas_vb_common import setup # noqa: F401
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ Other
^^^^^

- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`)
-
- Improved the asv benchmark for reading Excel files; it now runs for all supported engines

.. _whatsnew_0.252.contributors:

Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,5 @@ dependencies:
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- odfpy
- pyreadstat # pandas.read_spss