Skip to content

Commit 072e40b

Browse files
Merge pull request #8548 from iosonofabio/excel_dtype
BUG: inconsistent and undocumented option "converters" to read_excel
2 parents cda4015 + 89d4871 commit 072e40b

File tree

6 files changed

+68
-3
lines changed

6 files changed

+68
-3
lines changed

doc/source/io.rst

+21
Original file line numberDiff line numberDiff line change
@@ -1992,6 +1992,27 @@ indices to be parsed.
19921992
19931993
read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3])
19941994
1995+
.. note::
1996+
1997+
It is possible to transform the contents of Excel cells via the ``converters``
1998+
option. For instance, to convert a column to boolean:
1999+
2000+
.. code-block:: python
2001+
2002+
read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool})
2003+
2004+
This option handles missing values and treats exceptions in the converters
2005+
as missing data. Transformations are applied cell by cell rather than to the
2006+
column as a whole, so the array dtype is not guaranteed. For instance, a
2007+
column of integers with missing values cannot be transformed to an array
2008+
with integer dtype, because NaN is strictly a float. You can manually mask
2009+
missing data to recover integer dtype:
2010+
2011+
.. code-block:: python
2012+
2013+
cfun = lambda x: int(x) if x else -1
2014+
read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
2015+
19952016
To write a DataFrame object to a sheet of an Excel file, you can use the
19962017
``to_excel`` instance method. The arguments are largely the same as ``to_csv``
19972018
described above, the first argument being the name of the excel file, and the

pandas/io/excel.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ def read_excel(io, sheetname=0, **kwds):
8383
Rows to skip at the beginning (0-indexed)
8484
skip_footer : int, default 0
8585
Rows at the end to skip (0-indexed)
86+
converters : dict, default None
87+
Dict of functions for converting values in certain columns. Keys can
88+
either be integers or column labels, values are functions that take one
89+
input argument, the Excel cell content, and return the transformed
90+
content.
8691
index_col : int, default None
8792
Column to use as the row labels of the DataFrame. Pass None if
8893
there is no such column
@@ -175,7 +180,7 @@ def __init__(self, io, **kwds):
175180
def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
176181
index_col=None, parse_cols=None, parse_dates=False,
177182
date_parser=None, na_values=None, thousands=None, chunksize=None,
178-
convert_float=True, has_index_names=False, **kwds):
183+
convert_float=True, has_index_names=False, converters=None, **kwds):
179184
"""Read an Excel table into DataFrame
180185
181186
Parameters
@@ -188,6 +193,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
188193
Rows to skip at the beginning (0-indexed)
189194
skip_footer : int, default 0
190195
Rows at the end to skip (0-indexed)
196+
converters : dict, default None
197+
Dict of functions for converting values in certain columns. Keys can
198+
either be integers or column labels
191199
index_col : int, default None
192200
Column to use as the row labels of the DataFrame. Pass None if
193201
there is no such column
@@ -235,6 +243,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
235243
thousands=thousands, chunksize=chunksize,
236244
skip_footer=skip_footer,
237245
convert_float=convert_float,
246+
converters=converters,
238247
**kwds)
239248

240249
def _should_parse(self, i, parse_cols):

pandas/io/parsers.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ class ParserWarning(Warning):
127127
Return TextFileReader object for iteration
128128
skipfooter : int, default 0
129129
Number of lines at bottom of file to skip (Unsupported with engine='c')
130-
converters : dict. optional
130+
converters : dict, default None
131131
Dict of functions for converting values in certain columns. Keys can either
132132
be integers or column labels
133133
verbose : boolean, default False
@@ -983,8 +983,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
983983
na_fvalues)
984984
coerce_type = True
985985
if conv_f is not None:
986-
values = lib.map_infer(values, conv_f)
986+
try:
987+
values = lib.map_infer(values, conv_f)
988+
except ValueError:
989+
mask = lib.ismember(values, na_values).view(np.uint8)
990+
values = lib.map_infer_mask(values, conv_f, mask)
987991
coerce_type = False
992+
988993
cvals, na_count = self._convert_types(
989994
values, set(col_na_values) | col_na_fvalues, coerce_type)
990995
result[c] = cvals
@@ -1269,6 +1274,11 @@ def TextParser(*args, **kwds):
12691274
Row numbers to skip
12701275
skip_footer : int
12711276
Number of lines at bottom of file to skip
1277+
converters : dict, default None
1278+
Dict of functions for converting values in certain columns. Keys can
1279+
either be integers or column labels, values are functions that take one
1280+
input argument, the cell (not column) content, and return the
1281+
transformed content.
12721282
encoding : string, default None
12731283
Encoding to use for UTF when reading/writing (ex. 'utf-8')
12741284
squeeze : boolean, default False
6 KB
Binary file not shown.
4.7 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+25
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,31 @@ def test_reader_special_dtypes(self):
399399
convert_float=False)
400400
tm.assert_frame_equal(actual, no_convert_float)
401401

402+
# GH8212 - support for converters and missing values
403+
def test_reader_converters(self):
404+
_skip_if_no_xlrd()
405+
406+
expected = DataFrame.from_items([
407+
("IntCol", [1, 2, -3, -1000, 0]),
408+
("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
409+
("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
410+
("StrCol", ['1', np.nan, '3', '4', '5']),
411+
])
412+
413+
converters = {'IntCol': lambda x: int(x) if x != '' else -1000,
414+
'FloatCol': lambda x: 10 * x if x else np.nan,
415+
2: lambda x: 'Found' if x != '' else 'Not found',
416+
3: lambda x: str(x) if x else '',
417+
}
418+
419+
xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx')
420+
xls_path = os.path.join(self.dirpath, 'test_converters.xls')
421+
422+
# should read in correctly and set types of single cells (not array dtypes)
423+
for path in (xls_path, xlsx_path):
424+
actual = read_excel(path, 'Sheet1', converters=converters)
425+
tm.assert_frame_equal(actual, expected)
426+
402427
def test_reader_seconds(self):
403428
# Test reading times with and without milliseconds. GH5945.
404429
_skip_if_no_xlrd()

0 commit comments

Comments
 (0)