Skip to content

Commit 588e29d

Browse files
authored
API: add dtype param to read_excel (#14786)
* API add dtype param to read_excel * doc fixup
1 parent 56c3aae commit 588e29d

File tree

8 files changed

+59
-9
lines changed

8 files changed

+59
-9
lines changed

doc/source/io.rst

+14
Original file line numberDiff line numberDiff line change
@@ -2538,6 +2538,20 @@ missing data to recover integer dtype:
25382538
cfun = lambda x: int(x) if x else -1
25392539
read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
25402540
2541+
dtype Specifications
2542+
++++++++++++++++++++
2543+
2544+
.. versionadded:: 0.20.0
2545+
2546+
As an alternative to converters, the type for an entire column can
2547+
be specified using the ``dtype`` keyword, which takes a dictionary
2548+
mapping column names to types. To interpret data with
2549+
no type inference, use the type ``str`` or ``object``.
2550+
2551+
.. code-block:: python
2552+
2553+
read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str})
2554+
25412555
.. _io.excel_writer:
25422556
25432557
Writing Excel Files

doc/source/whatsnew/v0.20.0.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ New features
2222
~~~~~~~~~~~~
2323

2424

25-
``read_csv`` supports ``dtype`` keyword for python engine
26-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
25+
``dtype`` keyword for data io
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2727

2828
The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
2929
is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information.
@@ -35,7 +35,7 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t
3535
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
3636

3737
The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing
38-
fixed-width text files.
38+
fixed-width text files, and :func:`read_excel` for parsing Excel files.
3939

4040
.. ipython:: python
4141

pandas/io/excel.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@
8787
either be integers or column labels, values are functions that take one
8888
input argument, the Excel cell content, and return the transformed
8989
content.
90+
dtype : Type name or dict of column -> type, default None
91+
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
92+
Use `str` or `object` to preserve the data as read and skip dtype inference.
93+
If converters are specified, they will be applied INSTEAD
94+
of dtype conversion.
95+
96+
.. versionadded:: 0.20.0
97+
9098
true_values : list, default None
9199
Values to consider as True
92100
@@ -184,8 +192,8 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
184192
index_col=None, names=None, parse_cols=None, parse_dates=False,
185193
date_parser=None, na_values=None, thousands=None,
186194
convert_float=True, has_index_names=None, converters=None,
187-
true_values=None, false_values=None, engine=None, squeeze=False,
188-
**kwds):
195+
dtype=None, true_values=None, false_values=None, engine=None,
196+
squeeze=False, **kwds):
189197

190198
if not isinstance(io, ExcelFile):
191199
io = ExcelFile(io, engine=engine)
@@ -195,7 +203,7 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
195203
index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates,
196204
date_parser=date_parser, na_values=na_values, thousands=thousands,
197205
convert_float=convert_float, has_index_names=has_index_names,
198-
skip_footer=skip_footer, converters=converters,
206+
skip_footer=skip_footer, converters=converters, dtype=dtype,
199207
true_values=true_values, false_values=false_values, squeeze=squeeze,
200208
**kwds)
201209

@@ -318,7 +326,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
318326
parse_cols=None, parse_dates=False, date_parser=None,
319327
na_values=None, thousands=None, convert_float=True,
320328
true_values=None, false_values=None, verbose=False,
321-
squeeze=False, **kwds):
329+
dtype=None, squeeze=False, **kwds):
322330

323331
skipfooter = kwds.pop('skipfooter', None)
324332
if skipfooter is not None:
@@ -501,6 +509,7 @@ def _parse_cell(cell_contents, cell_typ):
501509
skiprows=skiprows,
502510
skipfooter=skip_footer,
503511
squeeze=squeeze,
512+
dtype=dtype,
504513
**kwds)
505514

506515
output[asheetname] = parser.read()

pandas/io/parsers.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from pandas.types.common import (is_integer, _ensure_object,
1919
is_list_like, is_integer_dtype,
2020
is_float, is_dtype_equal,
21-
is_object_dtype,
21+
is_object_dtype, is_string_dtype,
2222
is_scalar, is_categorical_dtype)
2323
from pandas.types.missing import isnull
2424
from pandas.types.cast import _astype_nansafe
@@ -1329,7 +1329,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13291329
try_num_bool=False)
13301330
else:
13311331
# skip inference if specified dtype is string-like (str or object)
1332-
try_num_bool = not (cast_type and is_object_dtype(cast_type))
1332+
try_num_bool = not (cast_type and is_string_dtype(cast_type))
13331333

13341334
# general type inference and conversion
13351335
cvals, na_count = self._infer_types(

pandas/io/tests/data/testdtype.xls

22 KB
Binary file not shown.

pandas/io/tests/data/testdtype.xlsm

8.32 KB
Binary file not shown.

pandas/io/tests/data/testdtype.xlsx

8.3 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+27
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,33 @@ def test_reader_converters(self):
373373
actual = self.get_exceldf(basename, 'Sheet1', converters=converters)
374374
tm.assert_frame_equal(actual, expected)
375375

376+
def test_reader_dtype(self):
377+
# GH 8212
378+
basename = 'testdtype'
379+
actual = self.get_exceldf(basename)
380+
381+
expected = DataFrame({
382+
'a': [1, 2, 3, 4],
383+
'b': [2.5, 3.5, 4.5, 5.5],
384+
'c': [1, 2, 3, 4],
385+
'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
386+
columns=['a', 'b', 'c', 'd'])
387+
388+
tm.assert_frame_equal(actual, expected)
389+
390+
actual = self.get_exceldf(basename,
391+
dtype={'a': 'float64',
392+
'b': 'float32',
393+
'c': str})
394+
395+
expected['a'] = expected['a'].astype('float64')
396+
expected['b'] = expected['b'].astype('float32')
397+
expected['c'] = ['001', '002', '003', '004']
398+
tm.assert_frame_equal(actual, expected)
399+
400+
with tm.assertRaises(ValueError):
401+
actual = self.get_exceldf(basename, dtype={'d': 'int64'})
402+
376403
def test_reading_all_sheets(self):
377404
# Test reading all sheetnames by setting sheetname to None,
378405
# Ensure a dict is returned.

0 commit comments

Comments
 (0)