-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Improved benchmark coverage for reading spreadsheets #28230
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
b0ffdc9
Improved benchmark coverage for reading spreadsheets
f6v 7c8c1f3
Added blank lines
f6v 6e421e3
More blank lines
f6v f7500c6
Updated whatsnew
f6v 9dcf7a9
- Removed whatsnew entry
f6v 1354f04
Updated requirements-dev.txt
f6v 4a0238f
Fixed imports order
f6v 682711b
Merge branch 'master' into improve_benchmark
f6v 96912d6
Fixed imports again
f6v 7800665
Merge branch 'improve_benchmark' of github.com:f6v/pandas into improve_benchmark
f6v e7d6986
Run black
f6v e7279e5
Changed conda channels order in ASV config
f6v a4b3cc2
Used setup_cache to speed up read benchmark
f6v File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +1,72 @@ | ||
from io import BytesIO | ||
|
||
import numpy as np | ||
from odf.opendocument import OpenDocumentSpreadsheet | ||
from odf.table import Table, TableCell, TableRow | ||
from odf.text import P | ||
|
||
from pandas import DataFrame, ExcelWriter, date_range, read_excel | ||
import pandas.util.testing as tm | ||
|
||
|
||
class Excel: | ||
def _generate_dataframe(): | ||
N = 2000 | ||
C = 5 | ||
df = DataFrame( | ||
np.random.randn(N, C), | ||
columns=["float{}".format(i) for i in range(C)], | ||
index=date_range("20000101", periods=N, freq="H"), | ||
) | ||
df["object"] = tm.makeStringIndex(N) | ||
return df | ||
|
||
|
||
class WriteExcel: | ||
|
||
params = ["openpyxl", "xlsxwriter", "xlwt"] | ||
param_names = ["engine"] | ||
|
||
def setup(self, engine): | ||
N = 2000 | ||
C = 5 | ||
self.df = DataFrame( | ||
np.random.randn(N, C), | ||
columns=["float{}".format(i) for i in range(C)], | ||
index=date_range("20000101", periods=N, freq="H"), | ||
) | ||
self.df["object"] = tm.makeStringIndex(N) | ||
self.bio_read = BytesIO() | ||
self.writer_read = ExcelWriter(self.bio_read, engine=engine) | ||
self.df.to_excel(self.writer_read, sheet_name="Sheet1") | ||
self.writer_read.save() | ||
self.bio_read.seek(0) | ||
|
||
def time_read_excel(self, engine): | ||
read_excel(self.bio_read) | ||
self.df = _generate_dataframe() | ||
|
||
def time_write_excel(self, engine): | ||
bio_write = BytesIO() | ||
bio_write.seek(0) | ||
writer_write = ExcelWriter(bio_write, engine=engine) | ||
self.df.to_excel(writer_write, sheet_name="Sheet1") | ||
writer_write.save() | ||
bio = BytesIO() | ||
bio.seek(0) | ||
writer = ExcelWriter(bio, engine=engine) | ||
self.df.to_excel(writer, sheet_name="Sheet1") | ||
writer.save() | ||
|
||
|
||
class ReadExcel: | ||
|
||
params = ["xlrd", "openpyxl", "odf"] | ||
param_names = ["engine"] | ||
fname_excel = "spreadsheet.xlsx" | ||
fname_odf = "spreadsheet.ods" | ||
|
||
def _create_odf(self): | ||
f6v marked this conversation as resolved.
Show resolved
Hide resolved
|
||
doc = OpenDocumentSpreadsheet() | ||
table = Table(name="Table1") | ||
for row in self.df.values: | ||
tr = TableRow() | ||
for val in row: | ||
tc = TableCell(valuetype="string") | ||
tc.addElement(P(text=val)) | ||
tr.addElement(tc) | ||
table.addElement(tr) | ||
|
||
doc.spreadsheet.addElement(table) | ||
doc.save(self.fname_odf) | ||
|
||
def setup_cache(self): | ||
self.df = _generate_dataframe() | ||
|
||
self.df.to_excel(self.fname_excel, sheet_name="Sheet1") | ||
self._create_odf() | ||
|
||
def time_read_excel(self, engine): | ||
fname = self.fname_odf if engine == "odf" else self.fname_excel | ||
read_excel(fname, engine=engine) | ||
|
||
|
||
from ..pandas_vb_common import setup # noqa: F401 isort:skip |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,4 +54,5 @@ xarray | |
xlrd | ||
xlsxwriter | ||
xlwt | ||
odfpy | ||
pyreadstat |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any reason for changing this? I think self-contained
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The naming was obsolete. Before the PR, all the setup was done in one function for both read and write benchmarks. Hence the variable names having the postfixes `_read` and `_write` were justified, but not anymore, I guess?