diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 571ede1a21134..c04bbf53a86a6 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -50,12 +50,13 @@ "xlsxwriter": [], "xlrd": [], "xlwt": [], + "odfpy": [], "pytest": [], // If using Windows with python 2.7 and want to build using the // mingw toolchain (rather than MSVC), uncomment the following line. // "libpython": [], }, - + "conda_channels": ["defaults", "conda-forge"], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 9aa5cbd5b6f7c..c97cf768e27d9 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,40 +1,72 @@ from io import BytesIO import numpy as np +from odf.opendocument import OpenDocumentSpreadsheet +from odf.table import Table, TableCell, TableRow +from odf.text import P from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm -class Excel: +def _generate_dataframe(): + N = 2000 + C = 5 + df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + df["object"] = tm.makeStringIndex(N) + return df + + +class WriteExcel: params = ["openpyxl", "xlsxwriter", "xlwt"] param_names = ["engine"] def setup(self, engine): - N = 2000 - C = 5 - self.df = DataFrame( - np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), - ) - self.df["object"] = tm.makeStringIndex(N) - self.bio_read = BytesIO() - self.writer_read = ExcelWriter(self.bio_read, engine=engine) - self.df.to_excel(self.writer_read, sheet_name="Sheet1") - self.writer_read.save() - self.bio_read.seek(0) - - def time_read_excel(self, engine): - read_excel(self.bio_read) + self.df = _generate_dataframe() def time_write_excel(self, engine): - bio_write = BytesIO() - bio_write.seek(0) - writer_write = ExcelWriter(bio_write, engine=engine) - self.df.to_excel(writer_write, sheet_name="Sheet1") - writer_write.save() + bio = BytesIO() + bio.seek(0) + writer = ExcelWriter(bio, engine=engine) + self.df.to_excel(writer, sheet_name="Sheet1") + writer.save() + + +class ReadExcel: + + params = ["xlrd", "openpyxl", "odf"] + param_names = ["engine"] + fname_excel = "spreadsheet.xlsx" + fname_odf = "spreadsheet.ods" + + def _create_odf(self): + doc = OpenDocumentSpreadsheet() + table = Table(name="Table1") + for row in self.df.values: + tr = TableRow() + for val in row: + tc = TableCell(valuetype="string") + tc.addElement(P(text=val)) + tr.addElement(tc) + table.addElement(tr) + + doc.spreadsheet.addElement(table) + doc.save(self.fname_odf) + + def setup_cache(self): + self.df = _generate_dataframe() + + self.df.to_excel(self.fname_excel, sheet_name="Sheet1") + self._create_odf() + + def time_read_excel(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/environment.yml b/environment.yml index 6d2cd701c3854..d72972ffc4da4 100644 --- a/environment.yml +++ b/environment.yml @@ -80,4 +80,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - odfpy # pandas.read_excel - pyreadstat # pandas.read_spss diff --git a/requirements-dev.txt b/requirements-dev.txt index cf11a3ee28258..c0fb9ee331b11 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -54,4 +54,5 @@ xarray xlrd xlsxwriter xlwt +odfpy pyreadstat \ No newline at end of file