Skip to content

Commit fd2d7c2

Browse files
committed
Merge pull request #5394 from jtratner/fix-int-handling-excel
ENH: read_excel: try converting numeric to int
2 parents e893f43 + 1ea301e commit fd2d7c2

File tree

6 files changed

+107
-30
lines changed

6 files changed

+107
-30
lines changed

doc/source/io.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
18391839
df1.to_excel(writer, sheet_name='Sheet1')
18401840
df2.to_excel(writer, sheet_name='Sheet2')
18411841
1842+
.. note:: Wringing a little more performance out of ``read_excel``
1843+
Internally, Excel stores all numeric data as floats. Because this can
1844+
produce unexpected behavior when reading in data, pandas defaults to trying
1845+
to convert integral floats to integers if it doesn't lose information (``1.0 -->
1846+
1``). You can pass ``convert_float=False`` to disable this behavior, which
1847+
may give a slight performance improvement.
1848+
18421849
.. _io.excel.writers:
18431850

18441851
Excel writer engines

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ Improvements to existing features
207207
closed])
208208
- Fixed bug in `tools.plotting.andrews_curves` so that lines are drawn grouped
209209
by color as expected.
210+
- ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
211+
by default. (:issue:`5394`)
210212

211213
API Changes
212214
~~~~~~~~~~~

pandas/io/excel.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def read_excel(io, sheetname, **kwds):
8383
engine: string, default None
8484
If io is not a buffer or path, this must be set to identify io.
8585
Acceptable values are None or xlrd
86+
convert_float : boolean, default True
87+
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
88+
data will be read in as floats: Excel stores all numbers as floats
89+
internally.
8690
8791
Returns
8892
-------
@@ -142,7 +146,7 @@ def __init__(self, io, **kwds):
142146
def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
143147
index_col=None, parse_cols=None, parse_dates=False,
144148
date_parser=None, na_values=None, thousands=None, chunksize=None,
145-
**kwds):
149+
convert_float=True, **kwds):
146150
"""Read an Excel table into DataFrame
147151
148152
Parameters
@@ -172,6 +176,10 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
172176
NaN values are overridden, otherwise they're appended to
173177
verbose : boolean, default False
174178
Indicate number of NA values placed in non-numeric columns
179+
convert_float : boolean, default True
180+
convert integral floats to int (i.e., 1.0 --> 1). If False, all
181+
numeric data will be read in as floats: Excel stores all numbers as
182+
floats internally.
175183
176184
Returns
177185
-------
@@ -191,7 +199,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
191199
parse_dates=parse_dates,
192200
date_parser=date_parser, na_values=na_values,
193201
thousands=thousands, chunksize=chunksize,
194-
skip_footer=skip_footer, **kwds)
202+
skip_footer=skip_footer,
203+
convert_float=convert_float,
204+
**kwds)
195205

196206
def _should_parse(self, i, parse_cols):
197207

@@ -229,9 +239,11 @@ def _excel2num(x):
229239
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
230240
index_col=None, has_index_names=None, parse_cols=None,
231241
parse_dates=False, date_parser=None, na_values=None,
232-
thousands=None, chunksize=None, **kwds):
242+
thousands=None, chunksize=None, convert_float=True,
243+
**kwds):
233244
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
234-
XL_CELL_ERROR, XL_CELL_BOOLEAN)
245+
XL_CELL_ERROR, XL_CELL_BOOLEAN,
246+
XL_CELL_NUMBER)
235247

236248
datemode = self.book.datemode
237249
if isinstance(sheetname, compat.string_types):
@@ -260,6 +272,13 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
260272
value = np.nan
261273
elif typ == XL_CELL_BOOLEAN:
262274
value = bool(value)
275+
elif convert_float and typ == XL_CELL_NUMBER:
276+
# GH5394 - Excel 'numbers' are always floats
277+
# it's a minimal perf hit and less surprising
278+
val = int(value)
279+
if val == value:
280+
value = val
281+
263282
row.append(value)
264283

265284
data.append(row)

pandas/io/tests/data/test_types.xls

25.5 KB
Binary file not shown.

pandas/io/tests/data/test_types.xlsx

33 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+75-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# pylint: disable=E1101
22

33
from pandas.compat import u, range, map
4+
from datetime import datetime
45
import os
56
import unittest
67

@@ -306,6 +307,56 @@ def test_reader_closes_file(self):
306307

307308
self.assertTrue(f.closed)
308309

310+
def test_reader_special_dtypes(self):
311+
_skip_if_no_xlrd()
312+
313+
expected = DataFrame.from_items([
314+
("IntCol", [1, 2, -3, 4, 0]),
315+
("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
316+
("BoolCol", [True, False, True, True, False]),
317+
("StrCol", [1, 2, 3, 4, 5]),
318+
# GH5394 - this is why convert_float isn't vectorized
319+
("Str2Col", ["a", 3, "c", "d", "e"]),
320+
("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
321+
datetime(1905, 1, 1), datetime(2013, 12, 14),
322+
datetime(2015, 3, 14)])
323+
])
324+
325+
xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
326+
xls_path = os.path.join(self.dirpath, 'test_types.xls')
327+
328+
# should read in correctly and infer types
329+
for path in (xls_path, xlsx_path):
330+
actual = read_excel(path, 'Sheet1')
331+
tm.assert_frame_equal(actual, expected)
332+
333+
# if not coercing number, then int comes in as float
334+
float_expected = expected.copy()
335+
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
336+
float_expected.loc[1, "Str2Col"] = 3.0
337+
for path in (xls_path, xlsx_path):
338+
actual = read_excel(path, 'Sheet1', convert_float=False)
339+
tm.assert_frame_equal(actual, float_expected)
340+
341+
# check setting Index (assuming xls and xlsx are the same here)
342+
for icol, name in enumerate(expected.columns):
343+
actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
344+
actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
345+
exp = expected.set_index(name)
346+
tm.assert_frame_equal(actual, exp)
347+
tm.assert_frame_equal(actual2, exp)
348+
349+
# convert_float and converters should be different but both accepted
350+
expected["StrCol"] = expected["StrCol"].apply(str)
351+
actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
352+
tm.assert_frame_equal(actual, expected)
353+
354+
no_convert_float = float_expected.copy()
355+
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
356+
actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
357+
convert_float=False)
358+
tm.assert_frame_equal(actual, no_convert_float)
359+
309360

310361
class ExcelWriterBase(SharedItems):
311362
# Base class for test cases to run with different Excel writers.
@@ -390,7 +441,7 @@ def test_roundtrip(self):
390441
tm.assert_frame_equal(self.frame, recons)
391442

392443
self.frame.to_excel(path, 'test1', na_rep='88')
393-
recons = read_excel(path, 'test1', index_col=0, na_values=[88,88.0])
444+
recons = read_excel(path, 'test1', index_col=0, na_values=[88, 88.0])
394445
tm.assert_frame_equal(self.frame, recons)
395446

396447
def test_mixed(self):
@@ -417,6 +468,16 @@ def test_tsframe(self):
417468
recons = reader.parse('test1')
418469
tm.assert_frame_equal(df, recons)
419470

471+
def test_basics_with_nan(self):
472+
_skip_if_no_xlrd()
473+
ext = self.ext
474+
path = '__tmp_to_excel_from_excel_int_types__.' + ext
475+
self.frame['A'][:5] = nan
476+
self.frame.to_excel(path, 'test1')
477+
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
478+
self.frame.to_excel(path, 'test1', header=False)
479+
self.frame.to_excel(path, 'test1', index=False)
480+
420481
def test_int_types(self):
421482
_skip_if_no_xlrd()
422483
ext = self.ext
@@ -425,20 +486,22 @@ def test_int_types(self):
425486
for np_type in (np.int8, np.int16, np.int32, np.int64):
426487

427488
with ensure_clean(path) as path:
428-
self.frame['A'][:5] = nan
429-
430-
self.frame.to_excel(path, 'test1')
431-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
432-
self.frame.to_excel(path, 'test1', header=False)
433-
self.frame.to_excel(path, 'test1', index=False)
434-
435-
# Test np.int values read come back as float.
489+
# Test np.int values read come back as int (rather than float
490+
# which is Excel's format).
436491
frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
437492
dtype=np_type)
438493
frame.to_excel(path, 'test1')
439494
reader = ExcelFile(path)
440-
recons = reader.parse('test1').astype(np_type)
441-
tm.assert_frame_equal(frame, recons, check_dtype=False)
495+
recons = reader.parse('test1')
496+
int_frame = frame.astype(int)
497+
tm.assert_frame_equal(int_frame, recons)
498+
recons2 = read_excel(path, 'test1')
499+
tm.assert_frame_equal(int_frame, recons2)
500+
501+
# test with convert_float=False comes back as float
502+
float_frame = frame.astype(float)
503+
recons = read_excel(path, 'test1', convert_float=False)
504+
tm.assert_frame_equal(recons, float_frame)
442505

443506
def test_float_types(self):
444507
_skip_if_no_xlrd()
@@ -447,13 +510,6 @@ def test_float_types(self):
447510

448511
for np_type in (np.float16, np.float32, np.float64):
449512
with ensure_clean(path) as path:
450-
self.frame['A'][:5] = nan
451-
452-
self.frame.to_excel(path, 'test1')
453-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
454-
self.frame.to_excel(path, 'test1', header=False)
455-
self.frame.to_excel(path, 'test1', index=False)
456-
457513
# Test np.float values read come back as float.
458514
frame = DataFrame(np.random.random_sample(10), dtype=np_type)
459515
frame.to_excel(path, 'test1')
@@ -468,13 +524,6 @@ def test_bool_types(self):
468524

469525
for np_type in (np.bool8, np.bool_):
470526
with ensure_clean(path) as path:
471-
self.frame['A'][:5] = nan
472-
473-
self.frame.to_excel(path, 'test1')
474-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
475-
self.frame.to_excel(path, 'test1', header=False)
476-
self.frame.to_excel(path, 'test1', index=False)
477-
478527
# Test np.bool values read come back as float.
479528
frame = (DataFrame([1, 0, True, False], dtype=np_type))
480529
frame.to_excel(path, 'test1')
@@ -1007,11 +1056,11 @@ def test_ExcelWriter_dispatch(self):
10071056
writer = ExcelWriter('apple.xls')
10081057
tm.assert_isinstance(writer, _XlwtWriter)
10091058

1010-
10111059
def test_register_writer(self):
10121060
# some awkward mocking to test out dispatch and such actually works
10131061
called_save = []
10141062
called_write_cells = []
1063+
10151064
class DummyClass(ExcelWriter):
10161065
called_save = False
10171066
called_write_cells = False

0 commit comments

Comments
 (0)