Skip to content

API: add dtype param to read_excel #14786

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 3, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,20 @@ missing data to recover integer dtype:
cfun = lambda x: int(x) if x else -1
read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})

dtype Specifications
++++++++++++++++++++

.. versionadded:: 0.20

As an alternative to converters, the type for an entire column can
be specified using the `dtype` keyword, which takes a dictionary
mapping column names to types. To interpret data with
no type inference, use the type ``str`` or ``object``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe double backticks here for str and object? (ok for dtype two lines above, as this is a keyword)


.. code-block:: python

read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str})

.. _io.excel_writer:

Writing Excel Files
Expand Down
6 changes: 3 additions & 3 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ New features
~~~~~~~~~~~~


``read_csv`` supports ``dtype`` keyword for python engine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``dtype`` keyword for data IO
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information.
Expand All @@ -35,7 +35,7 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes

The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing
fixed-width text files.
fixed-width text files, and :func:`read_excel` for parsing Excel files.

.. ipython:: python

Expand Down
17 changes: 13 additions & 4 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@
either be integers or column labels, values are functions that take one
input argument, the Excel cell content, and return the transformed
content.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `str` or `object` to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 0.20.0

true_values : list, default None
Values to consider as True

Expand Down Expand Up @@ -184,8 +192,8 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
index_col=None, names=None, parse_cols=None, parse_dates=False,
date_parser=None, na_values=None, thousands=None,
convert_float=True, has_index_names=None, converters=None,
true_values=None, false_values=None, engine=None, squeeze=False,
**kwds):
dtype=None, true_values=None, false_values=None, engine=None,
squeeze=False, **kwds):

if not isinstance(io, ExcelFile):
io = ExcelFile(io, engine=engine)
Expand All @@ -195,7 +203,7 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates,
date_parser=date_parser, na_values=na_values, thousands=thousands,
convert_float=convert_float, has_index_names=has_index_names,
skip_footer=skip_footer, converters=converters,
skip_footer=skip_footer, converters=converters, dtype=dtype,
true_values=true_values, false_values=false_values, squeeze=squeeze,
**kwds)

Expand Down Expand Up @@ -318,7 +326,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
parse_cols=None, parse_dates=False, date_parser=None,
na_values=None, thousands=None, convert_float=True,
true_values=None, false_values=None, verbose=False,
squeeze=False, **kwds):
dtype=None, squeeze=False, **kwds):

skipfooter = kwds.pop('skipfooter', None)
if skipfooter is not None:
Expand Down Expand Up @@ -501,6 +509,7 @@ def _parse_cell(cell_contents, cell_typ):
skiprows=skiprows,
skipfooter=skip_footer,
squeeze=squeeze,
dtype=dtype,
**kwds)

output[asheetname] = parser.read()
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pandas.types.common import (is_integer, _ensure_object,
is_list_like, is_integer_dtype,
is_float, is_dtype_equal,
is_object_dtype,
is_object_dtype, is_string_dtype,
is_scalar, is_categorical_dtype)
from pandas.types.missing import isnull
from pandas.types.cast import _astype_nansafe
Expand Down Expand Up @@ -1329,7 +1329,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
try_num_bool=False)
else:
# skip inference if specified dtype is object or string-like
try_num_bool = not (cast_type and is_object_dtype(cast_type))
try_num_bool = not (cast_type and is_string_dtype(cast_type))

# general type inference and conversion
cvals, na_count = self._infer_types(
Expand Down
Binary file added pandas/io/tests/data/testdtype.xls
Binary file not shown.
Binary file added pandas/io/tests/data/testdtype.xlsm
Binary file not shown.
Binary file added pandas/io/tests/data/testdtype.xlsx
Binary file not shown.
27 changes: 27 additions & 0 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,33 @@ def test_reader_converters(self):
actual = self.get_exceldf(basename, 'Sheet1', converters=converters)
tm.assert_frame_equal(actual, expected)

def test_reader_dtype(self):
    """Check that the ``dtype`` keyword is honoured when reading Excel.

    Three scenarios are exercised against the ``testdtype`` fixture:

    1. No ``dtype`` given: the frame must match the default expectation.
    2. A per-column ``dtype`` mapping: columns 'a' and 'b' are cast to
       the requested float types and column 'c' is read as strings.
    3. An impossible cast (column 'd' contains NaN, requested 'int64'):
       a ``ValueError`` must be raised.
    """
    # GH 8212
    basename = 'testdtype'

    # Scenario 1: default reading, no dtype supplied.
    actual = self.get_exceldf(basename)

    expected = DataFrame({
        'a': [1, 2, 3, 4],
        'b': [2.5, 3.5, 4.5, 5.5],
        'c': [1, 2, 3, 4],
        'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
            columns=['a', 'b', 'c', 'd'])

    tm.assert_frame_equal(actual, expected)

    # Scenario 2: explicit dtypes for a subset of the columns; column 'd'
    # is deliberately left out of the mapping.
    actual = self.get_exceldf(basename,
                              dtype={'a': 'float64',
                                     'b': 'float32',
                                     'c': str})

    expected['a'] = expected['a'].astype('float64')
    expected['b'] = expected['b'].astype('float32')
    # With ``str`` requested the values are expected to come back as
    # zero-padded text rather than as integers.
    expected['c'] = ['001', '002', '003', '004']
    tm.assert_frame_equal(actual, expected)

    # Scenario 3: column 'd' holds a NaN, so an integer cast must fail.
    # The call is made only for its exception; the result is not bound.
    with tm.assertRaises(ValueError):
        self.get_exceldf(basename, dtype={'d': 'int64'})

def test_reading_all_sheets(self):
# Test reading all sheetnames by setting sheetname to None,
# Ensure a dict is returned.
Expand Down