Skip to content

Commit 9220309

Browse files
Kevin Sheppard (bashtage)
Kevin Sheppard
authored and committed
ENH: Enable ExcelWriter to construct in-memory sheets
Add support for StringIO/BytesIO to ExcelWriter. Add vbench support for writing Excel files. Add support for serializing lists/dicts to strings. Fix bug when reading blank Excel sheets. Add xlwt to Python 3.4 builds. closes pandas-dev#8188, closes pandas-dev#7074, closes pandas-dev#6403, closes pandas-dev#7171, closes pandas-dev#6947
1 parent 0b74c72 commit 9220309

13 files changed

+179
-14
lines changed

ci/requirements-3.4.txt

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ pytz
33
openpyxl
44
xlsxwriter
55
xlrd
6+
xlwt
67
html5lib
78
patsy
89
beautiful-soup

ci/requirements-3.4_SLOW.txt

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ pytz
33
openpyxl
44
xlsxwriter
55
xlrd
6+
xlwt
67
html5lib
78
patsy
89
beautiful-soup

doc/source/install.rst

+1-2
Original file line numberDiff line numberDiff line change
@@ -249,10 +249,9 @@ Optional Dependencies
249249
* `statsmodels <http://statsmodels.sourceforge.net/>`__
250250
* Needed for parts of :mod:`pandas.stats`
251251
* `openpyxl <http://packages.python.org/openpyxl/>`__, `xlrd/xlwt <http://www.python-excel.org/>`__
252-
* openpyxl version 1.6.1 or higher, but lower than 2.0.0
253252
* Needed for Excel I/O
254253
* `XlsxWriter <https://pypi.python.org/pypi/XlsxWriter>`__
255-
* Alternative Excel writer.
254+
* Alternative Excel writer
256255
* `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3
257256
access.
258257
* `blosc <https://pypi.python.org/pypi/blosc>`__: for msgpack compression using ``blosc``

doc/source/io.rst

+42-1
Original file line numberDiff line numberDiff line change
@@ -2130,7 +2130,9 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
21302130
df1.to_excel(writer, sheet_name='Sheet1')
21312131
df2.to_excel(writer, sheet_name='Sheet2')
21322132
2133-
.. note:: Wringing a little more performance out of ``read_excel``
2133+
.. note::
2134+
2135+
Wringing a little more performance out of ``read_excel``
21342136
Internally, Excel stores all numeric data as floats. Because this can
21352137
produce unexpected behavior when reading in data, pandas defaults to trying
21362138
to convert floats to integers if it doesn't lose information (``1.0 -->
@@ -2182,6 +2184,45 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
21822184
21832185
df.to_excel('path_to_file.xlsx', sheet_name='Sheet1')
21842186
2187+
Writing Excel Files to Memory
2188+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2189+
2190+
.. versionadded:: 0.17
2191+
2192+
.. _io.excel_writing_buffer:
2193+
2194+
Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or
2195+
``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`.
2196+
2197+
.. code-block:: python
2198+
2199+
# Safe import for either Python 2.x or 3.x
2200+
try:
2201+
from io import BytesIO
2202+
except ImportError:
2203+
from cStringIO import StringIO as BytesIO
2204+
2205+
bio = BytesIO()
2206+
2207+
# Select the writer by setting 'engine' in the ExcelWriter constructor.
2208+
writer = ExcelWriter(bio, engine='xlsxwriter')
2209+
df.to_excel(writer, sheet_name='Sheet1')
2210+
2211+
# Save the workbook
2212+
writer.save()
2213+
2214+
# Seek to the beginning and read to copy the workbook to a variable in memory
2215+
bio.seek(0)
2216+
workbook = bio.read()
2217+
2218+
.. note::
2219+
2220+
``engine`` is optional but recommended. Setting the engine determines
2221+
the version of workbook produced. Setting ``engine='xlwt'`` will produce an
2222+
Excel 2003-format workbook (xls). Using either ``'openpyxl'`` or
2223+
``'xlsxwriter'`` will produce an Excel 2007-format workbook (xlsx). If
2224+
omitted, an Excel 2007-formatted workbook is produced.
2225+
21852226
.. _io.clipboard:
21862227

21872228
Clipboard

doc/source/whatsnew/v0.17.0.txt

+6
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Backwards incompatible API changes
3838

3939
Other API Changes
4040
^^^^^^^^^^^^^^^^^
41+
- Enable writing Excel files in :ref:`memory <io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
42+
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
4143

4244
.. _whatsnew_0170.deprecations:
4345

@@ -53,11 +55,15 @@ Removal of prior version deprecations/changes
5355

5456
Performance Improvements
5557
~~~~~~~~~~~~~~~~~~~~~~~~
58+
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
5659

5760
.. _whatsnew_0170.bug_fixes:
5861

5962
Bug Fixes
6063
~~~~~~~~~
6164
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
65+
66+
6267
- Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
6368
- Bug in ``DataFrame.reset_index`` when index contains `NaT`. (:issue:`10388`)
69+
- Bug in ``read_excel`` when worksheet is empty (:issue:`6403`)

pandas/core/frame.py

+3
Original file line numberDiff line numberDiff line change
@@ -1246,6 +1246,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
12461246
>>> df1.to_excel(writer,'Sheet1')
12471247
>>> df2.to_excel(writer,'Sheet2')
12481248
>>> writer.save()
1249+
1250+
For compatibility with to_csv, to_excel serializes lists and dicts to
1251+
strings before writing.
12491252
"""
12501253
from pandas.io.excel import ExcelWriter
12511254
if self.columns.nlevels > 1:

pandas/io/excel.py

+30-8
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
import abc
1010
import numpy as np
1111

12+
from pandas.core.frame import DataFrame
1213
from pandas.io.parsers import TextParser
1314
from pandas.io.common import _is_url, _urlopen
1415
from pandas.tseries.period import Period
1516
from pandas import json
16-
from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass
17+
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
18+
BytesIO, string_types)
1719
from pandas.core import config
1820
from pandas.core.common import pprint_thing
1921
import pandas.compat as compat
@@ -417,10 +419,13 @@ def _parse_cell(cell_contents,cell_typ):
417419
if parse_cols is None or should_parse[j]:
418420
row.append(_parse_cell(value,typ))
419421
data.append(row)
420-
422+
423+
if sheet.nrows == 0:
424+
return DataFrame()
425+
421426
if header is not None:
422427
data[header] = _trim_excel_header(data[header])
423-
428+
424429
parser = TextParser(data, header=header, index_col=index_col,
425430
has_index_names=has_index_names,
426431
na_values=na_values,
@@ -474,6 +479,8 @@ def _conv_value(val):
474479
val = bool(val)
475480
elif isinstance(val, Period):
476481
val = "%s" % val
482+
elif com.is_list_like(val):
483+
val = str(val)
477484

478485
return val
479486

@@ -497,6 +504,11 @@ class ExcelWriter(object):
497504
datetime_format : string, default None
498505
Format string for datetime objects written into Excel files
499506
(e.g. 'YYYY-MM-DD HH:MM:SS')
507+
508+
Notes
509+
-----
510+
For compatibility with CSV writers, ExcelWriter serializes lists
511+
and dicts to strings before writing.
500512
"""
501513
# Defining an ExcelWriter implementation (see abstract methods for more...)
502514

@@ -521,9 +533,13 @@ class ExcelWriter(object):
521533
# ExcelWriter.
522534
def __new__(cls, path, engine=None, **kwargs):
523535
# only switch class if generic(ExcelWriter)
524-
if cls == ExcelWriter:
536+
if issubclass(cls, ExcelWriter):
525537
if engine is None:
526-
ext = os.path.splitext(path)[-1][1:]
538+
if isinstance(path, string_types):
539+
ext = os.path.splitext(path)[-1][1:]
540+
else:
541+
ext = 'xlsx'
542+
527543
try:
528544
engine = config.get_option('io.excel.%s.writer' % ext)
529545
except KeyError:
@@ -574,7 +590,11 @@ def save(self):
574590
def __init__(self, path, engine=None,
575591
date_format=None, datetime_format=None, **engine_kwargs):
576592
# validate that this engine can handle the extension
577-
ext = os.path.splitext(path)[-1]
593+
if isinstance(path, string_types):
594+
ext = os.path.splitext(path)[-1]
595+
else:
596+
ext = 'xls' if engine == 'xlwt' else 'xlsx'
597+
578598
self.check_extension(ext)
579599

580600
self.path = path
@@ -1159,7 +1179,7 @@ class _XlwtWriter(ExcelWriter):
11591179
def __init__(self, path, engine=None, encoding=None, **engine_kwargs):
11601180
# Use the xlwt module as the Excel writer.
11611181
import xlwt
1162-
1182+
engine_kwargs['engine'] = engine
11631183
super(_XlwtWriter, self).__init__(path, **engine_kwargs)
11641184

11651185
if encoding is None:
@@ -1311,6 +1331,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
13111331
style_dict = {}
13121332

13131333
for cell in cells:
1334+
val = _conv_value(cell.val)
1335+
13141336
num_format_str = None
13151337
if isinstance(cell.val, datetime.datetime):
13161338
num_format_str = self.datetime_format
@@ -1336,7 +1358,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
13361358
else:
13371359
wks.write(startrow + cell.row,
13381360
startcol + cell.col,
1339-
cell.val, style)
1361+
val, style)
13401362

13411363
def _convert_to_style(self, style_dict, num_format_str=None):
13421364
"""

pandas/io/tests/data/blank.xls

22.5 KB
Binary file not shown.

pandas/io/tests/data/blank.xlsx

8.18 KB
Binary file not shown.
22.5 KB
Binary file not shown.
8.57 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+51-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# pylint: disable=E1101
22

3-
from pandas.compat import u, range, map, openpyxl_compat
3+
from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems
44
from datetime import datetime, date, time
55
import sys
66
import os
@@ -455,7 +455,7 @@ def test_reading_multiple_specific_sheets(self):
455455
def test_creating_and_reading_multiple_sheets(self):
456456
# Test reading multiple sheets, from a runtime created excel file
457457
# with multiple sheets.
458-
# See PR #9450
458+
# See PR #9450
459459

460460
_skip_if_no_xlrd()
461461
_skip_if_no_xlwt()
@@ -471,7 +471,7 @@ def tdf(sheetname):
471471

472472
with ensure_clean('.xlsx') as pth:
473473
with ExcelWriter(pth) as ew:
474-
for sheetname, df in dfs.iteritems():
474+
for sheetname, df in iteritems(dfs):
475475
df.to_excel(ew,sheetname)
476476
dfs_returned = pd.read_excel(pth,sheetname=sheets)
477477
for s in sheets:
@@ -520,6 +520,29 @@ def test_reader_seconds(self):
520520
actual = read_excel(epoch_1904, 'Sheet1')
521521
tm.assert_frame_equal(actual, expected)
522522

523+
# GH6403
524+
def test_read_excel_blank(self):
525+
_skip_if_no_xlrd()
526+
527+
blank = os.path.join(self.dirpath, 'blank.xls')
528+
actual = read_excel(blank, 'Sheet1')
529+
tm.assert_frame_equal(actual, DataFrame())
530+
531+
blank = os.path.join(self.dirpath, 'blank.xlsx')
532+
actual = read_excel(blank, 'Sheet1')
533+
tm.assert_frame_equal(actual, DataFrame())
534+
535+
def test_read_excel_blank_with_header(self):
536+
_skip_if_no_xlrd()
537+
538+
expected = DataFrame(columns=['col_1', 'col_2'])
539+
blank = os.path.join(self.dirpath, 'blank_with_header.xls')
540+
actual = read_excel(blank, 'Sheet1')
541+
tm.assert_frame_equal(actual, expected)
542+
543+
blank = os.path.join(self.dirpath, 'blank_with_header.xlsx')
544+
actual = read_excel(blank, 'Sheet1')
545+
tm.assert_frame_equal(actual, expected)
523546

524547
class ExcelWriterBase(SharedItems):
525548
# Base class for test cases to run with different Excel writers.
@@ -1218,6 +1241,30 @@ def test_datetimes(self):
12181241

12191242
tm.assert_series_equal(write_frame['A'], read_frame['A'])
12201243

1244+
# GH7074
1245+
def test_bytes_io(self):
1246+
bio = BytesIO()
1247+
df = DataFrame(np.random.randn(10, 2))
1248+
writer = ExcelWriter(bio)
1249+
df.to_excel(writer)
1250+
writer.save()
1251+
bio.seek(0)
1252+
reread_df = pd.read_excel(bio)
1253+
tm.assert_frame_equal(df, reread_df)
1254+
1255+
# GH8188
1256+
def test_write_lists_dict(self):
1257+
df = pd.DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}],
1258+
'numeric': [1, 2, 3.0],
1259+
'str': ['apple', 'banana', 'cherry']})
1260+
expected = df.copy()
1261+
expected.mixed = expected.mixed.apply(str)
1262+
expected.numeric = expected.numeric.astype('int64')
1263+
with ensure_clean(self.ext) as path:
1264+
df.to_excel(path, 'Sheet1')
1265+
read = read_excel(path, 'Sheet1', header=0)
1266+
tm.assert_frame_equal(read, expected)
1267+
12211268
def raise_wrapper(major_ver):
12221269
def versioned_raise_wrapper(orig_method):
12231270
@functools.wraps(orig_method)
@@ -1512,6 +1559,7 @@ class XlsxWriterTests_NoMerge(ExcelWriterBase, tm.TestCase):
15121559

15131560

15141561
class ExcelWriterEngineTests(tm.TestCase):
1562+
15151563
def test_ExcelWriter_dispatch(self):
15161564
with tm.assertRaisesRegexp(ValueError, 'No engine'):
15171565
ExcelWriter('nothing')

vb_suite/packers.py

+44
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import os
88
import pandas as pd
99
from pandas.core import common as com
10+
from pandas.compat import BytesIO
1011
from random import randrange
1112
1213
f = '__test__.msg'
@@ -206,3 +207,46 @@ def remove(f):
206207
packers_read_stata_with_validation = Benchmark("pd.read_stata(f)", setup, start_date=start_date)
207208

208209
packers_write_stata_with_validation = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date)
210+
211+
#----------------------------------------------------------------------
212+
# Excel - alternative writers
213+
setup = common_setup + """
214+
bio = BytesIO()
215+
"""
216+
217+
excel_writer_bench = """
218+
bio.seek(0)
219+
writer = pd.io.excel.ExcelWriter(bio, engine='{engine}')
220+
df[:2000].to_excel(writer)
221+
writer.save()
222+
"""
223+
224+
benchmark_xlsxwriter = excel_writer_bench.format(engine='xlsxwriter')
225+
226+
packers_write_excel_xlsxwriter = Benchmark(benchmark_xlsxwriter, setup)
227+
228+
benchmark_openpyxl = excel_writer_bench.format(engine='openpyxl')
229+
230+
packers_write_excel_openpyxl = Benchmark(benchmark_openpyxl, setup)
231+
232+
benchmark_xlwt = excel_writer_bench.format(engine='xlwt')
233+
234+
packers_write_excel_xlwt = Benchmark(benchmark_xlwt, setup)
235+
236+
237+
#----------------------------------------------------------------------
238+
# Excel - reader
239+
240+
setup = common_setup + """
241+
bio = BytesIO()
242+
writer = pd.io.excel.ExcelWriter(bio, engine='xlsxwriter')
243+
df[:2000].to_excel(writer)
244+
writer.save()
245+
"""
246+
247+
benchmark_read_excel="""
248+
bio.seek(0)
249+
pd.read_excel(bio)
250+
"""
251+
252+
packers_read_excel = Benchmark(benchmark_read_excel, setup)

0 commit comments

Comments
 (0)