-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Improved benchmark coverage for reading spreadsheets #28230
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
b0ffdc9
7c8c1f3
6e421e3
f7500c6
9dcf7a9
1354f04
4a0238f
682711b
96912d6
7800665
e7d6986
e7279e5
a4b3cc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +1,71 @@ | ||
from io import BytesIO | ||
|
||
import numpy as np | ||
from odf.opendocument import OpenDocumentSpreadsheet | ||
from odf.table import Table, TableCell, TableRow | ||
from odf.text import P | ||
|
||
from pandas import DataFrame, ExcelWriter, date_range, read_excel | ||
import pandas.util.testing as tm | ||
|
||
|
||
class Excel: | ||
def _generate_dataframe():
    """Build the DataFrame shared by the spreadsheet benchmarks.

    The frame has 2000 rows indexed by a ``date_range`` starting at
    "20000101" with ``freq="H"``, five random float columns named
    ``float0`` .. ``float4``, and an ``object`` column filled from
    ``tm.makeStringIndex``.
    """
    n_rows, n_float_cols = 2000, 5
    float_names = ["float{}".format(pos) for pos in range(n_float_cols)]
    time_index = date_range("20000101", periods=n_rows, freq="H")
    values = np.random.randn(n_rows, n_float_cols)
    frame = DataFrame(values, columns=float_names, index=time_index)
    frame["object"] = tm.makeStringIndex(n_rows)
    return frame
|
||
|
||
# Benchmark class parametrized over the Excel writer engines listed in
# `params`; each method receives the current engine name as `engine`.
# NOTE(review): this text is a rendered diff without +/- markers, so removed
# and added lines appear interleaved below -- TODO confirm against the real file.
class WriteExcel: | ||
|
||
params = ["openpyxl", "xlsxwriter", "xlwt"] | ||
param_names = ["engine"] | ||
|
||
def setup(self, engine): | ||
# The inline construction below duplicates _generate_dataframe() and also
# pre-writes a workbook into self.bio_read; presumably this is the removed
# side of the diff -- verify.
N = 2000 | ||
C = 5 | ||
self.df = DataFrame( | ||
np.random.randn(N, C), | ||
columns=["float{}".format(i) for i in range(C)], | ||
index=date_range("20000101", periods=N, freq="H"), | ||
) | ||
self.df["object"] = tm.makeStringIndex(N) | ||
self.bio_read = BytesIO() | ||
self.writer_read = ExcelWriter(self.bio_read, engine=engine) | ||
self.df.to_excel(self.writer_read, sheet_name="Sheet1") | ||
self.writer_read.save() | ||
# Rewind the buffer so a later read starts at the beginning.
self.bio_read.seek(0) | ||
|
||
# Reads back the workbook prepared in self.bio_read. Presumably removed by
# this change, since ReadExcel defines its own time_read_excel -- verify.
def time_read_excel(self, engine): | ||
read_excel(self.bio_read) | ||
# Presumably the added body of setup() (shared helper replaces the inline
# construction above) rather than part of time_read_excel -- confirm.
self.df = _generate_dataframe() | ||
|
||
# Writes self.df to a fresh in-memory buffer using the parametrized engine.
def time_write_excel(self, engine): | ||
# Variant using the *_write names; looks like the removed side -- confirm.
bio_write = BytesIO() | ||
bio_write.seek(0) | ||
writer_write = ExcelWriter(bio_write, engine=engine) | ||
self.df.to_excel(writer_write, sheet_name="Sheet1") | ||
writer_write.save() | ||
# Same steps with the shorter names (bio, writer); presumably the added side.
bio = BytesIO() | ||
bio.seek(0) | ||
writer = ExcelWriter(bio, engine=engine) | ||
self.df.to_excel(writer, sheet_name="Sheet1") | ||
writer.save() | ||
|
||
|
||
class ReadExcel: | ||
|
||
params = ["xlrd", "openpyxl", "odf"] | ||
param_names = ["engine"] | ||
|
||
def _create_odf(self):
    """Write ``self.df`` to ``self.fname_odf`` as an OpenDocument spreadsheet.

    Every value of the frame is stored in a cell created with
    ``valuetype="string"``, inside a single table named "Table1".
    """
    sheet = Table(name="Table1")
    for record in self.df.values:
        odf_row = TableRow()
        for item in record:
            cell = TableCell(valuetype="string")
            cell.addElement(P(text=item))
            odf_row.addElement(cell)
        sheet.addElement(odf_row)

    document = OpenDocumentSpreadsheet()
    document.spreadsheet.addElement(sheet)
    document.save(self.fname_odf)
|
||
# Prepares the files read by the benchmark: builds the shared DataFrame,
# writes it to "spreadsheet.xlsx" via DataFrame.to_excel and to
# "spreadsheet.ods" via self._create_odf(). The `engine` argument is not
# used in this method.
def setup(self, engine): | ||
# NOTE(review): the next line is PR review discussion captured by the page
# extraction, not code; inline code fragments in it were lost.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know how much time the setup process here takes? Wonder if this should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was north of 1 second, but not more than 1.2, I think. I've replaced |
||
self.df = _generate_dataframe() | ||
self.fname_excel = "spreadsheet.xlsx" | ||
mroeschke marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.fname_odf = "spreadsheet.ods" | ||
# Both files are written relative to the current working directory.
self.df.to_excel(self.fname_excel, sheet_name="Sheet1") | ||
self._create_odf() | ||
|
||
def time_read_excel(self, engine):
    """Read the prepared spreadsheet with the requested engine.

    The ``odf`` engine reads ``self.fname_odf``; every other engine reads
    ``self.fname_excel``.
    """
    if engine == "odf":
        path = self.fname_odf
    else:
        path = self.fname_excel
    read_excel(path, engine=engine)
|
||
|
||
from ..pandas_vb_common import setup # noqa: F401 isort:skip |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,4 +54,5 @@ xarray | |
xlrd | ||
xlsxwriter | ||
xlwt | ||
odfpy | ||
pyreadstat |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any reason for changing this? I think self-contained
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The naming was obsolete. Before the PR, all the setup was done in one function for both the read and write benchmarks, so the `_read` and `_write` suffixes on the variable names were justified, but not anymore, I guess?