Merge pull request #8548 from iosonofabio/excel_dtype

jorisvandenbossche · jorisvandenbossche · commit 072e40b0b5be · 2014-11-15T20:01:57.000+01:00
BUG: inconsistent and undocumented option "converters" to read_excel
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1992,6 +1992,27 @@ indices to be parsed.
 
    read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3])
 
+.. note::
+
+   It is possible to transform the contents of Excel cells via the ``converters``
+   option. For instance, to convert a column to boolean:
+   
+   .. code-block:: python
+   
+      read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool})
+   
+   This option handles missing values and treats exceptions in the converters
+   as missing data. Transformations are applied cell by cell rather than to the
+   column as a whole, so the array dtype is not guaranteed. For instance, a
+   column of integers with missing values cannot be transformed to an array
+   with integer dtype, because NaN is strictly a float. You can manually mask
+   missing data to recover integer dtype:
+   
+   .. code-block:: python
+   
+      cfun = lambda x: int(x) if x != '' else -1
+      read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
+
 To write a DataFrame object to a sheet of an Excel file, you can use the
 ``to_excel`` instance method.  The arguments are largely the same as ``to_csv``
 described above, the first argument being the name of the excel file, and the
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -83,6 +83,11 @@ def read_excel(io, sheetname=0, **kwds):
         Rows to skip at the beginning (0-indexed)
     skip_footer : int, default 0
         Rows at the end to skip (0-indexed)
+    converters : dict, default None
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the Excel cell content, and return the transformed
+        content.
     index_col : int, default None
         Column to use as the row labels of the DataFrame. Pass None if
         there is no such column
@@ -175,7 +180,7 @@ def __init__(self, io, **kwds):
     def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
               index_col=None, parse_cols=None, parse_dates=False,
               date_parser=None, na_values=None, thousands=None, chunksize=None,
-              convert_float=True, has_index_names=False, **kwds):
+              convert_float=True, has_index_names=False, converters=None, **kwds):
         """Read an Excel table into DataFrame
 
         Parameters
@@ -188,6 +193,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
             Rows to skip at the beginning (0-indexed)
         skip_footer : int, default 0
             Rows at the end to skip (0-indexed)
+        converters : dict, default None
+            Dict of functions for converting values in certain columns. Keys can
+            either be integers or column labels
         index_col : int, default None
             Column to use as the row labels of the DataFrame. Pass None if
             there is no such column
@@ -235,6 +243,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                                  thousands=thousands, chunksize=chunksize,
                                  skip_footer=skip_footer,
                                  convert_float=convert_float,
+                                 converters=converters,
                                  **kwds)
 
     def _should_parse(self, i, parse_cols):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -127,7 +127,7 @@ class ParserWarning(Warning):
     Return TextFileReader object for iteration
 skipfooter : int, default 0
     Number of lines at bottom of file to skip (Unsupported with engine='c')
-converters : dict. optional
+converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can either
     be integers or column labels
 verbose : boolean, default False
@@ -983,8 +983,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                                                            na_fvalues)
             coerce_type = True
             if conv_f is not None:
-                values = lib.map_infer(values, conv_f)
+                try:
+                    values = lib.map_infer(values, conv_f)
+                except ValueError:  # retry with the NA cells masked
+                    mask = lib.ismember(values, na_values).view(np.uint8)
+                    values = lib.map_infer_mask(values, conv_f, mask)
                 coerce_type = False
+
             cvals, na_count = self._convert_types(
                 values, set(col_na_values) | col_na_fvalues, coerce_type)
             result[c] = cvals
@@ -1269,6 +1274,11 @@ def TextParser(*args, **kwds):
         Row numbers to skip
     skip_footer : int
         Number of line at bottom of file to skip
+    converters : dict, default None
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the cell (not column) content, and return the
+        transformed content.
     encoding : string, default None
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     squeeze : boolean, default False
diff --git a/pandas/io/tests/data/test_converters.xls b/pandas/io/tests/data/test_converters.xls
diff --git a/pandas/io/tests/data/test_converters.xlsx b/pandas/io/tests/data/test_converters.xlsx
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -399,6 +399,31 @@ def test_reader_special_dtypes(self):
                            convert_float=False)
         tm.assert_frame_equal(actual, no_convert_float)
 
+    # GH8212 - support for converters and missing values
+    def test_reader_converters(self):
+        _skip_if_no_xlrd()
+
+        expected = DataFrame.from_items([
+            ("IntCol", [1, 2, -3, -1000, 0]),
+            ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
+            ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
+            ("StrCol", ['1', np.nan, '3', '4', '5']),
+        ])
+
+        converters = {'IntCol': lambda x: int(x) if x != '' else -1000,
+                      'FloatCol': lambda x: 10 * x if x else np.nan,
+                      2: lambda x: 'Found' if x != '' else 'Not found',
+                      3: lambda x: str(x) if x else '',
+                      }
+
+        xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx')
+        xls_path = os.path.join(self.dirpath, 'test_converters.xls')
+
+        # should read in correctly and set types of single cells (not array dtypes)
+        for path in (xls_path, xlsx_path):
+            actual = read_excel(path, 'Sheet1', converters=converters)
+            tm.assert_frame_equal(actual, expected)
+
     def test_reader_seconds(self):
         # Test reading times with and without milliseconds. GH5945.
         _skip_if_no_xlrd()