Skip to content

ENH: read_excel: try converting numeric to int #5394

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 1, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')

.. note:: Wringing a little more performance out of ``read_excel``
Internally, Excel stores all numeric data as floats. Because this can
produce unexpected behavior when reading in data, pandas defaults to trying
to convert integral floats to integers when no information is lost (``1.0 -->
1``). You can pass ``convert_float=False`` to disable this behavior, which
may give a slight performance improvement.

.. _io.excel.writers:

Excel writer engines
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ Improvements to existing features
closed])
- Fixed bug in `tools.plotting.andrews_curves` so that lines are drawn grouped
by color as expected.
- ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
by default. (:issue:`5394`)

API Changes
~~~~~~~~~~~
Expand Down
27 changes: 23 additions & 4 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ def read_excel(io, sheetname, **kwds):
engine: string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or xlrd
convert_float : boolean, default True
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
data will be read in as floats: Excel stores all numbers as floats
internally.

Returns
-------
Expand Down Expand Up @@ -142,7 +146,7 @@ def __init__(self, io, **kwds):
def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
index_col=None, parse_cols=None, parse_dates=False,
date_parser=None, na_values=None, thousands=None, chunksize=None,
**kwds):
convert_float=True, **kwds):
"""Read an Excel table into DataFrame

Parameters
Expand Down Expand Up @@ -172,6 +176,10 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
NaN values are overridden, otherwise they're appended to
verbose : boolean, default False
Indicate number of NA values placed in non-numeric columns
convert_float : boolean, default True
convert integral floats to int (i.e., 1.0 --> 1). If False, all
numeric data will be read in as floats: Excel stores all numbers as
floats internally.

Returns
-------
Expand All @@ -191,7 +199,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
parse_dates=parse_dates,
date_parser=date_parser, na_values=na_values,
thousands=thousands, chunksize=chunksize,
skip_footer=skip_footer, **kwds)
skip_footer=skip_footer,
convert_float=convert_float,
**kwds)

def _should_parse(self, i, parse_cols):

Expand Down Expand Up @@ -229,9 +239,11 @@ def _excel2num(x):
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
index_col=None, has_index_names=None, parse_cols=None,
parse_dates=False, date_parser=None, na_values=None,
thousands=None, chunksize=None, **kwds):
thousands=None, chunksize=None, convert_float=True,
**kwds):
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN)
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)

datemode = self.book.datemode
if isinstance(sheetname, compat.string_types):
Expand Down Expand Up @@ -260,6 +272,13 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
value = np.nan
elif typ == XL_CELL_BOOLEAN:
value = bool(value)
elif convert_float and typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
val = int(value)
if val == value:
value = val

row.append(value)

data.append(row)
Expand Down
Binary file added pandas/io/tests/data/test_types.xls
Binary file not shown.
Binary file added pandas/io/tests/data/test_types.xlsx
Binary file not shown.
101 changes: 75 additions & 26 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# pylint: disable=E1101

from pandas.compat import u, range, map
from datetime import datetime
import os
import unittest

Expand Down Expand Up @@ -306,6 +307,56 @@ def test_reader_closes_file(self):

self.assertTrue(f.closed)

def test_reader_special_dtypes(self):
    """Check cell type handling when reading the ``test_types`` workbooks.

    ``Sheet1`` of the ``.xls`` and ``.xlsx`` fixtures is read and compared
    against hand-built frames in four situations: default settings,
    ``convert_float=False``, every column used as the index (selected by
    position and by name), and a ``converters`` mapping combined with
    either ``convert_float`` setting.
    """
    _skip_if_no_xlrd()

    date_values = [datetime(2013, 10, 30), datetime(2013, 10, 31),
                   datetime(1905, 1, 1), datetime(2013, 12, 14),
                   datetime(2015, 3, 14)]
    expected = DataFrame.from_items([
        ("IntCol", [1, 2, -3, 4, 0]),
        ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
        ("BoolCol", [True, False, True, True, False]),
        ("StrCol", [1, 2, 3, 4, 5]),
        # GH5394 - this is why convert_float isn't vectorized
        ("Str2Col", ["a", 3, "c", "d", "e"]),
        ("DateCol", date_values)
    ])

    xls_path = os.path.join(self.dirpath, 'test_types.xls')
    xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
    both_paths = (xls_path, xlsx_path)

    # default behaviour: types are inferred, integral numbers become ints
    for workbook in both_paths:
        tm.assert_frame_equal(read_excel(workbook, 'Sheet1'), expected)

    # with float conversion switched off, numeric cells stay floats
    float_expected = expected.copy()
    float_expected["IntCol"] = float_expected["IntCol"].astype(float)
    float_expected.loc[1, "Str2Col"] = 3.0
    for workbook in both_paths:
        as_float = read_excel(workbook, 'Sheet1', convert_float=False)
        tm.assert_frame_equal(as_float, float_expected)

    # each column in turn as the Index (only xlsx is checked; xls is
    # assumed to behave the same)
    for position, label in enumerate(expected.columns):
        by_position = read_excel(xlsx_path, 'Sheet1', index_col=position)
        by_label = read_excel(xlsx_path, 'Sheet1', index_col=label)
        indexed = expected.set_index(label)
        for result in (by_position, by_label):
            tm.assert_frame_equal(result, indexed)

    # convert_float and converters should be different but both accepted
    expected["StrCol"] = expected["StrCol"].apply(str)
    converted = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
    tm.assert_frame_equal(converted, expected)

    no_convert_float = float_expected.copy()
    no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
    converted = read_excel(xlsx_path, 'Sheet1',
                           converters={"StrCol": str},
                           convert_float=False)
    tm.assert_frame_equal(converted, no_convert_float)


class ExcelWriterBase(SharedItems):
# Base class for test cases to run with different Excel writers.
Expand Down Expand Up @@ -390,7 +441,7 @@ def test_roundtrip(self):
tm.assert_frame_equal(self.frame, recons)

self.frame.to_excel(path, 'test1', na_rep='88')
recons = read_excel(path, 'test1', index_col=0, na_values=[88,88.0])
recons = read_excel(path, 'test1', index_col=0, na_values=[88, 88.0])
tm.assert_frame_equal(self.frame, recons)

def test_mixed(self):
Expand All @@ -417,6 +468,16 @@ def test_tsframe(self):
recons = reader.parse('test1')
tm.assert_frame_equal(df, recons)

def test_basics_with_nan(self):
    """Smoke-test ``to_excel`` on a frame that contains NaN values.

    The first five entries of column ``A`` are set to NaN and the frame is
    written four times with different ``to_excel`` options. Nothing is
    read back or asserted; the test only checks that the writes do not
    raise.
    """
    _skip_if_no_xlrd()
    ext = self.ext
    path = '__tmp_to_excel_from_excel_int_types__.' + ext

    # Use the same temp-file guard as the sibling tests so the workbook is
    # not left behind in the working directory (presumably ensure_clean
    # removes the file on exit -- confirm against its definition).
    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

def test_int_types(self):
_skip_if_no_xlrd()
ext = self.ext
Expand All @@ -425,20 +486,22 @@ def test_int_types(self):
for np_type in (np.int8, np.int16, np.int32, np.int64):

with ensure_clean(path) as path:
self.frame['A'][:5] = nan

self.frame.to_excel(path, 'test1')
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
self.frame.to_excel(path, 'test1', header=False)
self.frame.to_excel(path, 'test1', index=False)

# Test np.int values read come back as float.
# Test np.int values read come back as int (rather than float
# which is Excel's format).
frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
dtype=np_type)
frame.to_excel(path, 'test1')
reader = ExcelFile(path)
recons = reader.parse('test1').astype(np_type)
tm.assert_frame_equal(frame, recons, check_dtype=False)
recons = reader.parse('test1')
int_frame = frame.astype(int)
tm.assert_frame_equal(int_frame, recons)
recons2 = read_excel(path, 'test1')
tm.assert_frame_equal(int_frame, recons2)

# test with convert_float=False comes back as float
float_frame = frame.astype(float)
recons = read_excel(path, 'test1', convert_float=False)
tm.assert_frame_equal(recons, float_frame)

def test_float_types(self):
_skip_if_no_xlrd()
Expand All @@ -447,13 +510,6 @@ def test_float_types(self):

for np_type in (np.float16, np.float32, np.float64):
with ensure_clean(path) as path:
self.frame['A'][:5] = nan

self.frame.to_excel(path, 'test1')
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
self.frame.to_excel(path, 'test1', header=False)
self.frame.to_excel(path, 'test1', index=False)

# Test np.float values read come back as float.
frame = DataFrame(np.random.random_sample(10), dtype=np_type)
frame.to_excel(path, 'test1')
Expand All @@ -468,13 +524,6 @@ def test_bool_types(self):

for np_type in (np.bool8, np.bool_):
with ensure_clean(path) as path:
self.frame['A'][:5] = nan

self.frame.to_excel(path, 'test1')
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
self.frame.to_excel(path, 'test1', header=False)
self.frame.to_excel(path, 'test1', index=False)

# Test np.bool values read come back as float.
frame = (DataFrame([1, 0, True, False], dtype=np_type))
frame.to_excel(path, 'test1')
Expand Down Expand Up @@ -1007,11 +1056,11 @@ def test_ExcelWriter_dispatch(self):
writer = ExcelWriter('apple.xls')
tm.assert_isinstance(writer, _XlwtWriter)


def test_register_writer(self):
# some awkward mocking to test out dispatch and such actually works
called_save = []
called_write_cells = []

class DummyClass(ExcelWriter):
called_save = False
called_write_cells = False
Expand Down