Skip to content

Commit d0597b8

Browse files
author
Fabio Zanini
committed
BUG: "converters" in read_excel with missing data
1 parent 4faf620 commit d0597b8

File tree

6 files changed

+64
-3
lines changed

6 files changed

+64
-3
lines changed

doc/source/io.rst

+24
Original file line numberDiff line numberDiff line change
@@ -1992,6 +1992,30 @@ indices to be parsed.
19921992
19931993
read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3])
19941994
1995+
.. note::
1996+
1997+
It is possible to transform the contents of Excel cells via the `converters`
1998+
option. It accepts a dictionary of functions: the keys are the names or
1999+
indices of columns to be transformed, the values are functions that take one
2000+
input argument, the Excel cell content, and return the transformed content.
2001+
For instance, to convert a column to boolean:
2002+
2003+
.. code-block:: python
2004+
2005+
read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool})
2006+
2007+
This option handles missing values and treats exceptions in the converters
2008+
as missing data. Transformations are applied cell by cell rather than to the
2009+
column as a whole, so the array dtype is not guaranteed. For instance, a
2010+
column of integers with missing values cannot be transformed to an array
2011+
with integer dtype, because NaN is strictly a float. You can manually mask
2012+
missing data to recover integer dtype:
2013+
2014+
.. code-block:: python
2015+
2016+
cfun = lambda x: int(x) if x else -1
2017+
read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
2018+
19952019
To write a DataFrame object to a sheet of an Excel file, you can use the
19962020
``to_excel`` instance method. The arguments are largely the same as ``to_csv``
19972021
described above, the first argument being the name of the excel file, and the

pandas/io/excel.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ def read_excel(io, sheetname=0, **kwds):
8383
Rows to skip at the beginning (0-indexed)
8484
skip_footer : int, default 0
8585
Rows at the end to skip (0-indexed)
86+
converters : dict, default None
87+
Dict of functions for converting values in certain columns. Keys can
88+
either be integers or column labels
8689
index_col : int, default None
8790
Column to use as the row labels of the DataFrame. Pass None if
8891
there is no such column
@@ -175,7 +178,7 @@ def __init__(self, io, **kwds):
175178
def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
176179
index_col=None, parse_cols=None, parse_dates=False,
177180
date_parser=None, na_values=None, thousands=None, chunksize=None,
178-
convert_float=True, has_index_names=False, **kwds):
181+
convert_float=True, has_index_names=False, converters=None, **kwds):
179182
"""Read an Excel table into DataFrame
180183
181184
Parameters
@@ -188,6 +191,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
188191
Rows to skip at the beginning (0-indexed)
189192
skip_footer : int, default 0
190193
Rows at the end to skip (0-indexed)
194+
converters : dict, default None
195+
Dict of functions for converting values in certain columns. Keys can
196+
either be integers or column labels
191197
index_col : int, default None
192198
Column to use as the row labels of the DataFrame. Pass None if
193199
there is no such column
@@ -235,6 +241,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
235241
thousands=thousands, chunksize=chunksize,
236242
skip_footer=skip_footer,
237243
convert_float=convert_float,
244+
converters=converters,
238245
**kwds)
239246

240247
def _should_parse(self, i, parse_cols):

pandas/io/parsers.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ class ParserWarning(Warning):
127127
Return TextFileReader object for iteration
128128
skipfooter : int, default 0
129129
Number of lines at bottom of file to skip (Unsupported with engine='c')
130-
converters : dict. optional
130+
converters : dict, default None
131131
Dict of functions for converting values in certain columns. Keys can either
132132
be integers or column labels
133133
verbose : boolean, default False
@@ -983,8 +983,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
983983
na_fvalues)
984984
coerce_type = True
985985
if conv_f is not None:
986-
values = lib.map_infer(values, conv_f)
986+
try:
987+
values = lib.map_infer(values, conv_f)
988+
except ValueError:
989+
mask = lib.ismember(values, na_values).view(np.uint8)
990+
values = lib.map_infer_mask(values, conv_f, mask)
987991
coerce_type = False
992+
988993
cvals, na_count = self._convert_types(
989994
values, set(col_na_values) | col_na_fvalues, coerce_type)
990995
result[c] = cvals
6 KB
Binary file not shown.
4.7 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+25
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,31 @@ def test_reader_special_dtypes(self):
399399
convert_float=False)
400400
tm.assert_frame_equal(actual, no_convert_float)
401401

402+
# GH8212 - support for converters and missing values
403+
def test_reader_converters(self):
404+
_skip_if_no_xlrd()
405+
406+
expected = DataFrame.from_items([
407+
("IntCol", [1, 2, -3, -1000, 0]),
408+
("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
409+
("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
410+
("StrCol", ['1', np.nan, '3', '4', '5']),
411+
])
412+
413+
converters = {'IntCol': lambda x: int(x) if x != '' else -1000,
414+
'FloatCol': lambda x: 10 * x if x else np.nan,
415+
2: lambda x: 'Found' if x != '' else 'Not found',
416+
3: lambda x: str(x) if x else '',
417+
}
418+
419+
xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx')
420+
xls_path = os.path.join(self.dirpath, 'test_converters.xls')
421+
422+
# should read in correctly and set types of single cells (not array dtypes)
423+
for path in (xls_path, xlsx_path):
424+
actual = read_excel(path, 'Sheet1', converters=converters)
425+
tm.assert_frame_equal(actual, expected)
426+
402427
def test_reader_seconds(self):
403428
# Test reading times with and without milliseconds. GH5945.
404429
_skip_if_no_xlrd()

0 commit comments

Comments
 (0)