From 917e9567108e174d453fdf3404e3eca2eaefbaa8 Mon Sep 17 00:00:00 2001 From: John McNamara Date: Sun, 6 Oct 2013 20:15:01 +0100 Subject: [PATCH 1/4] ENH: Support for PyExcelerate as an Excel writer engine. --- .travis.yml | 1 + doc/source/install.rst | 4 +-- doc/source/release.rst | 2 ++ pandas/io/excel.py | 48 +++++++++++++++++++++++++ pandas/io/tests/test_excel.py | 68 +++++++++++++++++++++++++++++++++++ pandas/tests/test_panel.py | 22 ++++++++++++ 6 files changed, 143 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 387dec1ed2658..1a44fdcef5f83 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,6 +41,7 @@ before_install: install: - echo "Waldo2" - ci/install.sh + - pip install git+git://github.com/jmcnamara/PyExcelerate@pandas before_script: - mysql -e 'create database pandas_nosetest;' diff --git a/doc/source/install.rst b/doc/source/install.rst index 532c90b83ebb0..4beea77c716ed 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -101,8 +101,8 @@ Optional Dependencies * `openpyxl `__, `xlrd/xlwt `__ * openpyxl version 1.6.1 or higher * Needed for Excel I/O - * `XlsxWriter `__ - * Alternative Excel writer. + * `XlsxWriter `__, `PyExcelerate `__ + * Alternative Excel writers. * `boto `__: necessary for Amazon S3 access. * One of `PyQt4 diff --git a/doc/source/release.rst b/doc/source/release.rst index 8488d03f97cbd..a5c55eaea9e2b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -136,6 +136,8 @@ Improvements to existing features - Added XlsxWriter as an optional ``ExcelWriter`` engine. This is about 5x faster than the default openpyxl xlsx writer and is equivalent in speed to the xlwt xls writer module. (:issue:`4542`) + - Added PyExcelerate as an optional ``ExcelWriter`` engine. This is about + 14x faster than the default openpyxl xlsx writer. - allow DataFrame constructor to accept more list-like objects, e.g. list of ``collections.Sequence`` and ``array.Array`` objects (:issue:`3783`, :issue:`4297`, :issue:`4851`), thanks @lgautier diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 6b83fada19001..2e037c8a60723 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -745,3 +745,51 @@ def _convert_to_style(self, style_dict, num_format_str=None): return xl_format register_writer(_XlsxWriter) + + +class _PyExcelerate(ExcelWriter): + engine = 'pyexcelerate' + supported_extensions = ('.xlsx',) + + def __init__(self, path, **engine_kwargs): + # Use the pyexcelerate module as the Excel writer. + import pyexcelerate + + super(_PyExcelerate, self).__init__(path, **engine_kwargs) + + self.book = pyexcelerate.Workbook(path, **engine_kwargs) + + def save(self): + """ + Save workbook to disk. + """ + return self.book.save(self.path) + + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + # Write the frame cells using pyexcelerate. + + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.new_sheet(sheet_name) + self.sheets[sheet_name] = wks + + for cell in cells: + val = _conv_value(cell.val) + + if cell.mergestart is not None and cell.mergeend is not None: +# wks.merge_range(startrow + cell.row, +# startrow + cell.mergestart, +# startcol + cell.col, +# startcol + cell.mergeend, +# val, style) + pass + else: + # wks[startrow + cell.row][startcol + cell.col] = val + wks[1 + startrow + cell.row][1 + startcol + cell.col] = val + + +register_writer(_PyExcelerate) + diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 38b3ee192ab7a..58942f949a44b 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -52,6 +52,13 @@ def _skip_if_no_xlsxwriter(): raise nose.SkipTest('xlsxwriter not installed, skipping') +def _skip_if_no_pyexcelerate(): + try: + import pyexcelerate # NOQA + except ImportError: + raise nose.SkipTest('pyexcelerate not installed, skipping') + + def _skip_if_no_excelsuite(): _skip_if_no_xlrd() _skip_if_no_xlwt() @@ -953,6 +960,67 @@ def test_roundtrip_indexlabels(self): self.assertAlmostEqual(frame.index.names, recons.index.names) +class PyExcelerateTests(ExcelWriterBase, unittest.TestCase): + ext = 'xlsx' + engine_name = 'pyexcelerate' + check_skip = staticmethod(_skip_if_no_pyexcelerate) + + # Override test from the Superclass to use assertAlmostEqual on the + # floating point values read back in from the output PyExcelerate file. + def test_roundtrip_indexlabels(self): + _skip_if_no_xlrd() + ext = self.ext + path = '__tmp_to_excel_from_excel_indexlabels__.' + ext + + with ensure_clean(path) as path: + + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', cols=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # test index_label + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, 'test1', index_label=['test']) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel( + path, 'test1', index_label=['test', 'dummy', 'dummy2']) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, 'test1', index_label='test') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0).astype(np.int64) + frame.index.names = ['test'] + self.assertAlmostEqual(frame.index.names, recons.index.names) + + # TODO: Skip these tests until the pyexcelerator date issue is fixed. + def test_excel_roundtrip_datetime(self): + raise nose.SkipTest('pyexcelerator dates not supported') + + def test_sheets(self): + raise nose.SkipTest('pyexcelerator dates not supported') + + def test_to_excel_multiindex_dates(self): + raise nose.SkipTest('pyexcelerator dates not supported') + + def test_to_excel_periodindex(self): + raise nose.SkipTest('pyexcelerator dates not supported') + + def test_tsframe(self): + raise nose.SkipTest('pyexcelerator dates not supported') + + class ExcelWriterEngineTests(unittest.TestCase): def test_ExcelWriter_dispatch(self): with tm.assertRaisesRegexp(ValueError, 'No engine'): diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 07b33266d88a1..172b766649a5a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1595,6 +1595,28 @@ def test_to_excel_xlsxwriter(self): recdf = reader.parse(str(item), index_col=0) assert_frame_equal(df, recdf) + def test_to_excel_pyexcelerate(self): + # TODO: Skip this test until the pyexcelerator date issue is fixed. + raise nose.SkipTest('pyexcelerator dates not supported') + try: + import xlrd + import pyexcelerate + from pandas.io.excel import ExcelFile + except ImportError: + raise nose.SkipTest("Requires xlrd and pyexcelerate. Skipping.") + + path = '__tmp__.xlsx' + with ensure_clean(path) as path: + self.panel.to_excel(path, engine='pyexcelerate') + try: + reader = ExcelFile(path) + except ImportError as e: + raise nose.SkipTest("cannot write excel file: %s" % e) + + for item, df in compat.iteritems(self.panel): + recdf = reader.parse(str(item), index_col=0) + assert_frame_equal(df, recdf) + def test_dropna(self): p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) p.ix[:, ['b', 'd'], 0] = np.nan From a3e5b5b056fb59ea254502b591d929ccd5722bf8 Mon Sep 17 00:00:00 2001 From: John McNamara Date: Mon, 7 Oct 2013 20:07:30 +0100 Subject: [PATCH 2/4] Turn on PyExcelerate date tests. --- .travis.yml | 3 ++- pandas/io/excel.py | 3 +++ pandas/io/tests/test_excel.py | 19 +------------------ pandas/tests/test_panel.py | 2 -- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1a44fdcef5f83..0fc29d774d6e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,7 +41,8 @@ before_install: install: - echo "Waldo2" - ci/install.sh - - pip install git+git://github.com/jmcnamara/PyExcelerate@pandas + # Temp testing measure while waiting for PyPi release. + - pip install git+git://github.com:kz26/PyExcelerate.git before_script: - mysql -e 'create database pandas_nosetest;' diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 2e037c8a60723..59bbf28d8fabc 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -779,6 +779,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for cell in cells: val = _conv_value(cell.val) + if isinstance(cell.val, datetime.date): + val = datetime.datetime.fromordinal(val.toordinal()) + if cell.mergestart is not None and cell.mergeend is not None: # wks.merge_range(startrow + cell.row, # startrow + cell.mergestart, diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 58942f949a44b..b04647eb43ae0 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1004,22 +1004,6 @@ def test_roundtrip_indexlabels(self): frame.index.names = ['test'] self.assertAlmostEqual(frame.index.names, recons.index.names) - # TODO: Skip these tests until the pyexcelerator date issue is fixed. - def test_excel_roundtrip_datetime(self): - raise nose.SkipTest('pyexcelerator dates not supported') - - def test_sheets(self): - raise nose.SkipTest('pyexcelerator dates not supported') - - def test_to_excel_multiindex_dates(self): - raise nose.SkipTest('pyexcelerator dates not supported') - - def test_to_excel_periodindex(self): - raise nose.SkipTest('pyexcelerator dates not supported') - - def test_tsframe(self): - raise nose.SkipTest('pyexcelerator dates not supported') - class ExcelWriterEngineTests(unittest.TestCase): def test_ExcelWriter_dispatch(self): @@ -1034,11 +1018,11 @@ def test_ExcelWriter_dispatch(self): writer = ExcelWriter('apple.xls') tm.assert_isinstance(writer, _XlwtWriter) - def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] + class DummyClass(ExcelWriter): called_save = False called_write_cells = False @@ -1066,7 +1050,6 @@ def check_called(func): func = lambda: df.to_excel('something.test') check_called(func) check_called(lambda: panel.to_excel('something.test')) - from pandas import set_option, get_option val = get_option('io.excel.xlsx.writer') set_option('io.excel.xlsx.writer', 'dummy') check_called(lambda: df.to_excel('something.xlsx')) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 172b766649a5a..bf0a37aad67cc 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1596,8 +1596,6 @@ def test_to_excel_xlsxwriter(self): assert_frame_equal(df, recdf) def test_to_excel_pyexcelerate(self): - # TODO: Skip this test until the pyexcelerator date issue is fixed. - raise nose.SkipTest('pyexcelerator dates not supported') try: import xlrd import pyexcelerate From 099e8658f53664b741aa3dfc75c015c40201ad05 Mon Sep 17 00:00:00 2001 From: John McNamara Date: Mon, 7 Oct 2013 23:52:56 +0100 Subject: [PATCH 3/4] Speed optimisation. --- pandas/io/excel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 59bbf28d8fabc..3beafed094778 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -790,8 +790,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # val, style) pass else: - # wks[startrow + cell.row][startcol + cell.col] = val - wks[1 + startrow + cell.row][1 + startcol + cell.col] = val + wks.set_cell_value(1 + startrow + cell.row, + 1 + startcol + cell.col, + val) register_writer(_PyExcelerate) From 44ae9ae33f6e839f7ca918ceb4411a398c1005d6 Mon Sep 17 00:00:00 2001 From: John McNamara Date: Mon, 7 Oct 2013 23:56:42 +0100 Subject: [PATCH 4/4] Point pyexcelerate install to master. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0fc29d774d6e5..f46d9c4735ca4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ install: - echo "Waldo2" - ci/install.sh # Temp testing measure while waiting for PyPi release. - - pip install git+git://github.com:kz26/PyExcelerate.git + - pip install git+git://github.com/kz26/PyExcelerate.git before_script: - mysql -e 'create database pandas_nosetest;'