diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt index 6a94d48ad7a5f..2e903102de7b1 100644 --- a/ci/requirements-2.7.txt +++ b/ci/requirements-2.7.txt @@ -8,6 +8,7 @@ numexpr==2.1 tables==2.3.1 matplotlib==1.1.1 openpyxl==1.6.2 +xlsxwriter==0.4.3 xlrd==0.9.2 patsy==0.1.0 html5lib==1.0b2 diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt index a7e9d62e3549b..056b63bbb8591 100644 --- a/ci/requirements-2.7_LOCALE.txt +++ b/ci/requirements-2.7_LOCALE.txt @@ -2,6 +2,7 @@ python-dateutil pytz==2013b xlwt==0.7.5 openpyxl==1.6.2 +xlsxwriter==0.4.3 xlrd==0.9.2 numpy==1.6.1 cython==0.19.1 diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt index e907a2fa828f1..b689047019ed7 100644 --- a/ci/requirements-3.2.txt +++ b/ci/requirements-3.2.txt @@ -1,6 +1,7 @@ python-dateutil==2.1 pytz==2013b openpyxl==1.6.2 +xlsxwriter==0.4.3 xlrd==0.9.2 numpy==1.6.2 cython==0.19.1 diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt index eb1e725d98040..326098be5f7f4 100644 --- a/ci/requirements-3.3.txt +++ b/ci/requirements-3.3.txt @@ -1,6 +1,7 @@ python-dateutil==2.1 pytz==2013b openpyxl==1.6.2 +xlsxwriter==0.4.3 xlrd==0.9.2 html5lib==1.0b2 numpy==1.7.1 diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 58c5b54968614..705514ac0c364 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -695,13 +695,13 @@ Writing to an excel file .. ipython:: python - df.to_excel('foo.xlsx', sheet_name='sheet1') + df.to_excel('foo.xlsx', sheet_name='Sheet1') Reading from an excel file .. ipython:: python - pd.read_excel('foo.xlsx', 'sheet1', index_col=None, na_values=['NA']) + pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) .. 
ipython:: python :suppress: diff --git a/doc/source/install.rst b/doc/source/install.rst index 4472d844c1871..b1dcad9448cfd 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -100,6 +100,8 @@ Optional Dependencies * `openpyxl `__, `xlrd/xlwt `__ * openpyxl version 1.6.1 or higher * Needed for Excel I/O + * `XlsxWriter `__ + * Alternative Excel writer. * `boto `__: necessary for Amazon S3 access. * One of `PyQt4 diff --git a/doc/source/io.rst b/doc/source/io.rst index b9581c37082bf..67492eddbac12 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1654,7 +1654,7 @@ indices to be parsed. .. code-block:: python - read_excel('path_to_file.xls', Sheet1', parse_cols=[0, 2, 3], index_col=None, na_values=['NA']) + read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3], index_col=None, na_values=['NA']) To write a DataFrame object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` @@ -1664,7 +1664,7 @@ written. For example: .. code-block:: python - df.to_excel('path_to_file.xlsx', sheet_name='sheet1') + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``openpyxl``. @@ -1677,8 +1677,8 @@ one can use the ExcelWriter class, as in the following example: .. code-block:: python writer = ExcelWriter('path_to_file.xlsx') - df1.to_excel(writer, sheet_name='sheet1') - df2.to_excel(writer, sheet_name='sheet2') + df1.to_excel(writer, sheet_name='Sheet1') + df2.to_excel(writer, sheet_name='Sheet2') writer.save() .. _io.excel.writers: @@ -1693,11 +1693,29 @@ Excel writer engines 1. the ``engine`` keyword argument 2. the filename extension (via the default specified in config options) -``pandas`` only supports ``openpyxl`` for ``.xlsx`` and ``.xlsm`` files and -``xlwt`` for ``.xls`` files. 
If you have multiple engines installed, you can choose the -engine to use by default via the options ``io.excel.xlsx.writer`` and -``io.excel.xls.writer``. +By default, ``pandas`` uses +`openpyxl `__ as a writer for ``.xlsx`` +and ``.xlsm`` files and `xlwt `__ as a writer for +``.xls`` files. If you have multiple engines installed, you can change the +default engine via the ``io.excel.xlsx.writer`` and ``io.excel.xls.writer`` +options. +For example, if the optional `XlsxWriter `__ +module is installed, you can use it as an xlsx writer engine as follows: + +.. code-block:: python + + # By setting the 'engine' in the DataFrame and Panel 'to_excel()' methods. + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') + + # By setting the 'engine' in the ExcelWriter constructor. + writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + + # Or via pandas configuration. + from pandas import set_option + set_option('io.excel.xlsx.writer', 'xlsxwriter') + + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') .. _io.hdf5: diff --git a/doc/source/release.rst b/doc/source/release.rst index f7755afe8caae..b71410a8f7e59 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -113,6 +113,9 @@ Improvements to existing features ``io.excel.xls.writer``. (:issue:`4745`, :issue:`4750`) - ``Panel.to_excel()`` now accepts keyword arguments that will be passed to its ``DataFrame``'s ``to_excel()`` methods. (:issue:`4750`) + - Added XlsxWriter as an optional ``ExcelWriter`` engine. This is about 5x + faster than the default openpyxl xlsx writer and is equivalent in speed + to the xlwt xls writer module. (:issue:`4542`) - allow DataFrame constructor to accept more list-like objects, e.g. 
list of ``collections.Sequence`` and ``array.Array`` objects (:issue:`3783`,:issue:`4297`, :issue:`4851`), thanks @lgautier diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 75f81d20926a1..20fe33226d7ca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1356,7 +1356,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, tupleize_cols=tupleize_cols) formatter.save() - def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', + def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None): """ @@ -1366,7 +1366,7 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', ---------- excel_writer : string or ExcelWriter object File path or existing ExcelWriter - sheet_name : string, default 'sheet1' + sheet_name : string, default 'Sheet1' Name of sheet which will contain DataFrame na_rep : string, default '' Missing data representation @@ -1397,8 +1397,8 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', to the existing workbook. 
This can be used to save different DataFrames to one workbook >>> writer = ExcelWriter('output.xlsx') - >>> df1.to_excel(writer,'sheet1') - >>> df2.to_excel(writer,'sheet2') + >>> df1.to_excel(writer,'Sheet1') + >>> df2.to_excel(writer,'Sheet2') >>> writer.save() """ from pandas.io.excel import ExcelWriter
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index f34c4f99a856d..6ce8eb697268b 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -596,6 +596,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None):
         Parameters
         ----------
         style_dict: style dictionary to convert
+        num_format_str: optional number format string
         """
         import xlwt
 
@@ -611,3 +612,109 @@ def _convert_to_style(cls, style_dict, num_format_str=None):
 
 
 register_writer(_XlwtWriter)
+
+class _XlsxWriter(ExcelWriter):
+    # Excel writer engine backed by the optional xlsxwriter module.
+    engine = 'xlsxwriter'
+    supported_extensions = ('.xlsx',)
+
+    def __init__(self, path, **engine_kwargs):
+        # Use the xlsxwriter module as the Excel writer.
+        import xlsxwriter
+
+        super(_XlsxWriter, self).__init__(path, **engine_kwargs)
+
+        self.book = xlsxwriter.Workbook(path, **engine_kwargs)
+
+    def save(self):
+        """
+        Save workbook to disk.
+        """
+        return self.book.close()
+
+    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
+        # Write the frame cells using xlsxwriter.
+
+        sheet_name = self._get_sheet_name(sheet_name)
+
+        if sheet_name in self.sheets:
+            wks = self.sheets[sheet_name]
+        else:
+            wks = self.book.add_worksheet(sheet_name)
+            self.sheets[sheet_name] = wks
+
+        # Cache of converted formats keyed by serialized style (plus number
+        # format), so cells with the same styling share one format object.
+        style_dict = {}
+
+        for cell in cells:
+            val = _conv_value(cell.val)
+
+            # datetime.datetime is a subclass of datetime.date, so it has to
+            # be tested first and the date test has to be an ``elif``;
+            # otherwise datetimes always get the date-only format.
+            num_format_str = None
+            if isinstance(cell.val, datetime.datetime):
+                num_format_str = "YYYY-MM-DD HH:MM:SS"
+            elif isinstance(cell.val, datetime.date):
+                num_format_str = "YYYY-MM-DD"
+
+            stylekey = json.dumps(cell.style)
+            if num_format_str:
+                stylekey += num_format_str
+
+            if stylekey in style_dict:
+                style = style_dict[stylekey]
+            else:
+                style = self._convert_to_style(cell.style, num_format_str)
+                style_dict[stylekey] = style
+
+            if cell.mergestart is not None and cell.mergeend is not None:
+                # merge_range() expects (first_row, first_col, last_row,
+                # last_col, data, format); cell.mergestart is a row index
+                # and cell.mergeend a column index.
+                wks.merge_range(startrow + cell.row,
+                                startcol + cell.col,
+                                startrow + cell.mergestart,
+                                startcol + cell.mergeend,
+                                val, style)
+            else:
+                wks.write(startrow + cell.row,
+                          startcol + cell.col,
+                          val, style)
+
+    def _convert_to_style(self, style_dict, num_format_str=None):
+        """
+        converts a style_dict to an xlsxwriter format object
+        Parameters
+        ----------
+        style_dict: style dictionary to convert
+        num_format_str: optional number format string
+        """
+        # No style and no number format: nothing to convert.
+        if style_dict is None and num_format_str is None:
+            return None
+
+        # Create a XlsxWriter format object.
+        xl_format = self.book.add_format()
+
+        if num_format_str is not None:
+            xl_format.set_num_format(num_format_str)
+
+        # Unstyled cells (e.g. plain dates) only need the number format.
+        if style_dict is None:
+            return xl_format
+
+        # Map the cell font to XlsxWriter font properties.
+        if style_dict.get('font'):
+            font = style_dict['font']
+            if font.get('bold'):
+                xl_format.set_bold()
+
+        # Map the cell borders to XlsxWriter border properties.
+        if style_dict.get('borders'):
+            xl_format.set_border()
+
+        return xl_format
+
+register_writer(_XlsxWriter)
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 00536026994c5..94f3e5a8cf746 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -16,9 +16,11 @@ register_writer ) from pandas.util.testing import ensure_clean +from pandas.core.config import set_option, get_option import pandas.util.testing as tm import pandas as pd + def _skip_if_no_xlrd(): try: import xlrd @@ -31,18 +33,25 @@ def _skip_if_no_xlrd(): def _skip_if_no_xlwt(): try: - import xlwt # NOQA + import xlwt # NOQA except ImportError: raise nose.SkipTest('xlwt not installed, skipping') def _skip_if_no_openpyxl(): try: - import openpyxl # NOQA + import openpyxl # NOQA except ImportError: raise nose.SkipTest('openpyxl not installed, skipping') +def _skip_if_no_xlsxwriter(): + try: + import xlsxwriter # NOQA + except ImportError: + raise nose.SkipTest('xlsxwriter not installed, skipping') + + def _skip_if_no_excelsuite(): _skip_if_no_xlrd() _skip_if_no_xlwt() @@ -268,15 +277,22 @@ def test_xlsx_table(self): class ExcelWriterBase(SharedItems): - # test cases to run with different extensions - # for each writer - # to add a writer test, define two things: - # 1. a check_skip function that skips your tests if your writer isn't - # installed - # 2. add a property ext, which is the file extension that your writer writes to + # Base class for test cases to run with different Excel writers. + # To add a writer test, define the following: + # 1. A check_skip function that skips your tests if your writer isn't + # installed. + # 2. Add a property ext, which is the file extension that your writer + # writes to. + # 3. Add a property engine_name, which is the name of the writer class. 
def setUp(self): self.check_skip() super(ExcelWriterBase, self).setUp() + self.option_name = 'io.excel.%s.writer' % self.ext + self.prev_engine = get_option(self.option_name) + set_option(self.option_name, self.engine_name) + + def tearDown(self): + set_option(self.option_name, self.prev_engine) def test_excel_sheet_by_name_raise(self): _skip_if_no_xlrd() @@ -790,6 +806,7 @@ def roundtrip(df, header=True, parser_hdr=0): class OpenpyxlTests(ExcelWriterBase, unittest.TestCase): ext = 'xlsx' + engine_name = 'openpyxl' check_skip = staticmethod(_skip_if_no_openpyxl) def test_to_excel_styleconverter(self): @@ -820,6 +837,7 @@ def test_to_excel_styleconverter(self): class XlwtTests(ExcelWriterBase, unittest.TestCase): ext = 'xls' + engine_name = 'xlwt' check_skip = staticmethod(_skip_if_no_xlwt) def test_to_excel_styleconverter(self): @@ -841,6 +859,52 @@ def test_to_excel_styleconverter(self): self.assertEquals(xlwt.Borders.THIN, xls_style.borders.left) self.assertEquals(xlwt.Alignment.HORZ_CENTER, xls_style.alignment.horz) + +class XlsxWriterTests(ExcelWriterBase, unittest.TestCase): + ext = 'xlsx' + engine_name = 'xlsxwriter' + check_skip = staticmethod(_skip_if_no_xlsxwriter) + + # Override test from the Superclass to use assertAlmostEqual on the + # floating point values read back in from the output XlsxWriter file. + def test_roundtrip_indexlabels(self): + _skip_if_no_xlrd() + ext = self.ext + path = '__tmp_to_excel_from_excel_indexlabels__.' 
+ ext + + with ensure_clean(path) as path: + + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', cols=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # test index_label + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, 'test1', index_label=['test']) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel( + path, 'test1', index_label=['test', 'dummy', 'dummy2']) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, 'test1', index_label='test') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertAlmostEqual(frame.index.names, recons.index.names) + + class ExcelWriterEngineTests(unittest.TestCase): def test_ExcelWriter_dispatch(self): with tm.assertRaisesRegexp(ValueError, 'No engine'): diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index fc86a78ea684b..d725a3ff6f135 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1429,6 +1429,26 @@ def test_to_excel(self): recdf = reader.parse(str(item), index_col=0) assert_frame_equal(df, recdf) + def test_to_excel_xlsxwriter(self): + try: + import xlrd + import xlsxwriter + from pandas.io.excel import ExcelFile + except ImportError: + raise nose.SkipTest("Requires xlrd and xlsxwriter. 
Skipping test.") + + path = '__tmp__.xlsx' + with ensure_clean(path) as path: + self.panel.to_excel(path, engine='xlsxwriter') + try: + reader = ExcelFile(path) + except ImportError: + raise nose.SkipTest + + for item, df in compat.iteritems(self.panel): + recdf = reader.parse(str(item), index_col=0) + assert_frame_equal(df, recdf) + def test_dropna(self): p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) p.ix[:, ['b', 'd'], 0] = np.nan diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index b7b4a936a1e90..d9c642372a9bb 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -104,6 +104,12 @@ def show_versions(): except: print("xlwt: Not installed") + try: + import xlsxwriter + print("xlsxwriter: %s" % xlsxwriter.__version__) + except: + print("xlsxwriter: Not installed") + try: import sqlalchemy print("sqlalchemy: %s" % sqlalchemy.__version__)