Merge pull request #5394 from jtratner/fix-int-handling-excel

jtratner · jtratner · commit fd2d7c27e2b1 · 2013-10-31T19:52:15.000-07:00
ENH: read_excel: try converting numeric to int
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
        df1.to_excel(writer, sheet_name='Sheet1')
        df2.to_excel(writer, sheet_name='Sheet2')
 
+.. note:: Wringing a little more performance out of ``read_excel``
+    Internally, Excel stores all numeric data as floats. Because this can
+    produce unexpected behavior when reading in data, pandas defaults to trying
+    to convert integral floats to integers when no information is lost
+    (``1.0 --> 1``).  You can pass ``convert_float=False`` to disable this
+    behavior, which may give a slight performance improvement.
+
 .. _io.excel.writers:
 
 Excel writer engines
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -207,6 +207,8 @@ Improvements to existing features
     closed])
   - Fixed bug in `tools.plotting.andrews_curvres` so that lines are drawn grouped
     by color as expected.
+  - ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
+    by default. (:issue:`5394`)
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -83,6 +83,10 @@ def read_excel(io, sheetname, **kwds):
     engine: string, default None
         If io is not a buffer or path, this must be set to identify io.
         Acceptable values are None or xlrd
+    convert_float : boolean, default True
+        convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
+        data will be read in as floats: Excel stores all numbers as floats
+        internally.
 
     Returns
     -------
@@ -142,7 +146,7 @@ def __init__(self, io, **kwds):
     def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
               index_col=None, parse_cols=None, parse_dates=False,
               date_parser=None, na_values=None, thousands=None, chunksize=None,
-              **kwds):
+              convert_float=True, **kwds):
         """Read an Excel table into DataFrame
 
         Parameters
@@ -172,6 +176,10 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
             NaN values are overridden, otherwise they're appended to
         verbose : boolean, default False
             Indicate number of NA values placed in non-numeric columns
+        convert_float : boolean, default True
+            convert integral floats to int (i.e., 1.0 --> 1). If False, all
+            numeric data will be read in as floats: Excel stores all numbers as
+            floats internally.
 
         Returns
         -------
@@ -191,7 +199,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
                                  parse_dates=parse_dates,
                                  date_parser=date_parser, na_values=na_values,
                                  thousands=thousands, chunksize=chunksize,
-                                 skip_footer=skip_footer, **kwds)
+                                 skip_footer=skip_footer,
+                                 convert_float=convert_float,
+                                 **kwds)
 
     def _should_parse(self, i, parse_cols):
 
@@ -229,9 +239,11 @@ def _excel2num(x):
     def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                      index_col=None, has_index_names=None, parse_cols=None,
                      parse_dates=False, date_parser=None, na_values=None,
-                     thousands=None, chunksize=None, **kwds):
+                     thousands=None, chunksize=None, convert_float=True,
+                     **kwds):
         from xlrd import (xldate_as_tuple, XL_CELL_DATE,
-                          XL_CELL_ERROR, XL_CELL_BOOLEAN)
+                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
+                          XL_CELL_NUMBER)
 
         datemode = self.book.datemode
         if isinstance(sheetname, compat.string_types):
@@ -260,6 +272,13 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                         value = np.nan
                     elif typ == XL_CELL_BOOLEAN:
                         value = bool(value)
+                    elif convert_float and typ == XL_CELL_NUMBER:
+                        # GH5394 - Excel 'numbers' are always floats
+                        # it's a minimal perf hit and less surprising
+                        val = int(value)
+                        if val == value:
+                            value = val
+
                     row.append(value)
 
             data.append(row)
diff --git a/pandas/io/tests/data/test_types.xls b/pandas/io/tests/data/test_types.xls
diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/io/tests/data/test_types.xlsx
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -1,6 +1,7 @@
 # pylint: disable=E1101
 
 from pandas.compat import u, range, map
+from datetime import datetime
 import os
 import unittest
 
@@ -306,6 +307,56 @@ def test_reader_closes_file(self):
 
         self.assertTrue(f.closed)
 
+    def test_reader_special_dtypes(self):
+        _skip_if_no_xlrd()
+
+        expected = DataFrame.from_items([
+            ("IntCol", [1, 2, -3, 4, 0]),
+            ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
+            ("BoolCol", [True, False, True, True, False]),
+            ("StrCol", [1, 2, 3, 4, 5]),
+            # GH5394 - this is why convert_float isn't vectorized
+            ("Str2Col", ["a", 3, "c", "d", "e"]),
+            ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
+                         datetime(1905, 1, 1), datetime(2013, 12, 14),
+                         datetime(2015, 3, 14)])
+        ])
+
+        xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
+        xls_path = os.path.join(self.dirpath, 'test_types.xls')
+
+        # should read in correctly and infer types
+        for path in (xls_path, xlsx_path):
+            actual = read_excel(path, 'Sheet1')
+            tm.assert_frame_equal(actual, expected)
+
+        # if not coercing number, then int comes in as float
+        float_expected = expected.copy()
+        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
+        float_expected.loc[1, "Str2Col"] = 3.0
+        for path in (xls_path, xlsx_path):
+            actual = read_excel(path, 'Sheet1', convert_float=False)
+            tm.assert_frame_equal(actual, float_expected)
+
+        # check setting Index (assuming xls and xlsx are the same here)
+        for icol, name in enumerate(expected.columns):
+            actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
+            actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
+            exp = expected.set_index(name)
+            tm.assert_frame_equal(actual, exp)
+            tm.assert_frame_equal(actual2, exp)
+
+        # convert_float and converters should be different but both accepted
+        expected["StrCol"] = expected["StrCol"].apply(str)
+        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
+        tm.assert_frame_equal(actual, expected)
+
+        no_convert_float = float_expected.copy()
+        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
+        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
+                           convert_float=False)
+        tm.assert_frame_equal(actual, no_convert_float)
+
 
 class ExcelWriterBase(SharedItems):
     # Base class for test cases to run with different Excel writers.
@@ -390,7 +441,7 @@ def test_roundtrip(self):
             tm.assert_frame_equal(self.frame, recons)
 
             self.frame.to_excel(path, 'test1', na_rep='88')
-            recons = read_excel(path, 'test1', index_col=0, na_values=[88,88.0])
+            recons = read_excel(path, 'test1', index_col=0, na_values=[88, 88.0])
             tm.assert_frame_equal(self.frame, recons)
 
     def test_mixed(self):
@@ -417,6 +468,16 @@ def test_tsframe(self):
             recons = reader.parse('test1')
             tm.assert_frame_equal(df, recons)
 
+    def test_basics_with_nan(self):
+        _skip_if_no_xlrd()
+        ext = self.ext
+        path = '__tmp_to_excel_from_excel_int_types__.' + ext
+        self.frame['A'][:5] = nan
+        self.frame.to_excel(path, 'test1')
+        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
+        self.frame.to_excel(path, 'test1', header=False)
+        self.frame.to_excel(path, 'test1', index=False)
+
     def test_int_types(self):
         _skip_if_no_xlrd()
         ext = self.ext
@@ -425,20 +486,22 @@ def test_int_types(self):
         for np_type in (np.int8, np.int16, np.int32, np.int64):
 
             with ensure_clean(path) as path:
-                self.frame['A'][:5] = nan
-
-                self.frame.to_excel(path, 'test1')
-                self.frame.to_excel(path, 'test1', cols=['A', 'B'])
-                self.frame.to_excel(path, 'test1', header=False)
-                self.frame.to_excel(path, 'test1', index=False)
-
-                # Test np.int values read come back as float.
+                # Test np.int values read come back as int (rather than float
+                # which is Excel's format).
                 frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
                                   dtype=np_type)
                 frame.to_excel(path, 'test1')
                 reader = ExcelFile(path)
-                recons = reader.parse('test1').astype(np_type)
-                tm.assert_frame_equal(frame, recons, check_dtype=False)
+                recons = reader.parse('test1')
+                int_frame = frame.astype(int)
+                tm.assert_frame_equal(int_frame, recons)
+                recons2 = read_excel(path, 'test1')
+                tm.assert_frame_equal(int_frame, recons2)
+
+                # test with convert_float=False comes back as float
+                float_frame = frame.astype(float)
+                recons = read_excel(path, 'test1', convert_float=False)
+                tm.assert_frame_equal(recons, float_frame)
 
     def test_float_types(self):
         _skip_if_no_xlrd()
@@ -447,13 +510,6 @@ def test_float_types(self):
 
         for np_type in (np.float16, np.float32, np.float64):
             with ensure_clean(path) as path:
-                self.frame['A'][:5] = nan
-
-                self.frame.to_excel(path, 'test1')
-                self.frame.to_excel(path, 'test1', cols=['A', 'B'])
-                self.frame.to_excel(path, 'test1', header=False)
-                self.frame.to_excel(path, 'test1', index=False)
-
                 # Test np.float values read come back as float.
                 frame = DataFrame(np.random.random_sample(10), dtype=np_type)
                 frame.to_excel(path, 'test1')
@@ -468,13 +524,6 @@ def test_bool_types(self):
 
         for np_type in (np.bool8, np.bool_):
             with ensure_clean(path) as path:
-                self.frame['A'][:5] = nan
-
-                self.frame.to_excel(path, 'test1')
-                self.frame.to_excel(path, 'test1', cols=['A', 'B'])
-                self.frame.to_excel(path, 'test1', header=False)
-                self.frame.to_excel(path, 'test1', index=False)
-
                 # Test np.bool values read come back as float.
                 frame = (DataFrame([1, 0, True, False], dtype=np_type))
                 frame.to_excel(path, 'test1')
@@ -1007,11 +1056,11 @@ def test_ExcelWriter_dispatch(self):
         writer = ExcelWriter('apple.xls')
         tm.assert_isinstance(writer, _XlwtWriter)
 
-
     def test_register_writer(self):
         # some awkward mocking to test out dispatch and such actually works
         called_save = []
         called_write_cells = []
+
         class DummyClass(ExcelWriter):
             called_save = False
             called_write_cells = False