diff --git a/doc/source/io.rst b/doc/source/io.rst index 0842893800dd5..1a879866c5516 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. df1.to_excel(writer, sheet_name='Sheet1') df2.to_excel(writer, sheet_name='Sheet2') +.. note:: Wringing a little more performance out of ``read_excel`` + Internally, Excel stores all numeric data as floats. Because this can + produce unexpected behavior when reading in data, pandas defaults to trying + to convert integral floats to ints if it doesn't lose information (``1.0 --> + 1``). You can pass ``convert_float=False`` to disable this behavior, which + may give a slight performance improvement. + .. _io.excel.writers: Excel writer engines diff --git a/doc/source/release.rst b/doc/source/release.rst index 8a0e31859c185..721ef9a1cbbf3 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -207,6 +207,8 @@ Improvements to existing features closed]) - Fixed bug in `tools.plotting.andrews_curvres` so that lines are drawn grouped by color as expected. + - ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int + by default. (:issue:`5394`) API Changes ~~~~~~~~~~~ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index ae844d1eeb5fc..42c212caf41ca 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -83,6 +83,10 @@ def read_excel(io, sheetname, **kwds): engine: string, default None If io is not a buffer or path, this must be set to identify io. Acceptable values are None or xlrd + convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally. 
Returns ------- @@ -142,7 +146,7 @@ def __init__(self, io, **kwds): def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, - **kwds): + convert_float=True, **kwds): """Read an Excel table into DataFrame Parameters @@ -172,6 +176,10 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, NaN values are overridden, otherwise they're appended to verbose : boolean, default False Indicate number of NA values placed in non-numeric columns + convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all + numeric data will be read in as floats: Excel stores all numbers as + floats internally. Returns ------- @@ -191,7 +199,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, chunksize=chunksize, - skip_footer=skip_footer, **kwds) + skip_footer=skip_footer, + convert_float=convert_float, + **kwds) def _should_parse(self, i, parse_cols): @@ -229,9 +239,11 @@ def _excel2num(x): def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, - thousands=None, chunksize=None, **kwds): + thousands=None, chunksize=None, convert_float=True, + **kwds): from xlrd import (xldate_as_tuple, XL_CELL_DATE, - XL_CELL_ERROR, XL_CELL_BOOLEAN) + XL_CELL_ERROR, XL_CELL_BOOLEAN, + XL_CELL_NUMBER) datemode = self.book.datemode if isinstance(sheetname, compat.string_types): @@ -260,6 +272,13 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) + elif convert_float and typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less surprising + val = int(value) 
+ if val == value: + value = val + row.append(value) data.append(row) diff --git a/pandas/io/tests/data/test_types.xls b/pandas/io/tests/data/test_types.xls new file mode 100644 index 0000000000000..2d387603a8307 Binary files /dev/null and b/pandas/io/tests/data/test_types.xls differ diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/io/tests/data/test_types.xlsx new file mode 100644 index 0000000000000..ef749e04ff3b5 Binary files /dev/null and b/pandas/io/tests/data/test_types.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 15130c552c8a8..311a0953f1c02 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1,6 +1,7 @@ # pylint: disable=E1101 from pandas.compat import u, range, map +from datetime import datetime import os import unittest @@ -306,6 +307,56 @@ def test_reader_closes_file(self): self.assertTrue(f.closed) + def test_reader_special_dtypes(self): + _skip_if_no_xlrd() + + expected = DataFrame.from_items([ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), + datetime(1905, 1, 1), datetime(2013, 12, 14), + datetime(2015, 3, 14)]) + ]) + + xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') + xls_path = os.path.join(self.dirpath, 'test_types.xls') + + # should read in correctly and infer types + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected["IntCol"] = float_expected["IntCol"].astype(float) + float_expected.loc[1, "Str2Col"] = 3.0 + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1', convert_float=False) + 
tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) + actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + tm.assert_frame_equal(actual2, exp) + + # convert_float and converters should be different but both accepted + expected["StrCol"] = expected["StrCol"].apply(str) + actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) + tm.assert_frame_equal(actual, expected) + + no_convert_float = float_expected.copy() + no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) + actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, + convert_float=False) + tm.assert_frame_equal(actual, no_convert_float) + class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. @@ -390,7 +441,7 @@ def test_roundtrip(self): tm.assert_frame_equal(self.frame, recons) self.frame.to_excel(path, 'test1', na_rep='88') - recons = read_excel(path, 'test1', index_col=0, na_values=[88,88.0]) + recons = read_excel(path, 'test1', index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(self.frame, recons) def test_mixed(self): @@ -417,6 +468,16 @@ def test_tsframe(self): recons = reader.parse('test1') tm.assert_frame_equal(df, recons) + def test_basics_with_nan(self): + _skip_if_no_xlrd() + ext = self.ext + path = '__tmp_to_excel_from_excel_int_types__.' 
+ ext + self.frame['A'][:5] = nan + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', cols=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + def test_int_types(self): _skip_if_no_xlrd() ext = self.ext @@ -425,20 +486,22 @@ def test_int_types(self): for np_type in (np.int8, np.int16, np.int32, np.int64): with ensure_clean(path) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', cols=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - # Test np.int values read come back as float. + # Test np.int values read come back as int (rather than float + # which is Excel's format). frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) frame.to_excel(path, 'test1') reader = ExcelFile(path) - recons = reader.parse('test1').astype(np_type) - tm.assert_frame_equal(frame, recons, check_dtype=False) + recons = reader.parse('test1') + int_frame = frame.astype(int) + tm.assert_frame_equal(int_frame, recons) + recons2 = read_excel(path, 'test1') + tm.assert_frame_equal(int_frame, recons2) + + # test with convert_float=False comes back as float + float_frame = frame.astype(float) + recons = read_excel(path, 'test1', convert_float=False) + tm.assert_frame_equal(recons, float_frame) def test_float_types(self): _skip_if_no_xlrd() @@ -447,13 +510,6 @@ def test_float_types(self): for np_type in (np.float16, np.float32, np.float64): with ensure_clean(path) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', cols=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - # Test np.float values read come back as float. 
frame = DataFrame(np.random.random_sample(10), dtype=np_type) frame.to_excel(path, 'test1') @@ -468,13 +524,6 @@ def test_bool_types(self): for np_type in (np.bool8, np.bool_): with ensure_clean(path) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', cols=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - # Test np.bool values read come back as float. frame = (DataFrame([1, 0, True, False], dtype=np_type)) frame.to_excel(path, 'test1') @@ -1007,11 +1056,11 @@ def test_ExcelWriter_dispatch(self): writer = ExcelWriter('apple.xls') tm.assert_isinstance(writer, _XlwtWriter) - def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] + class DummyClass(ExcelWriter): called_save = False called_write_cells = False