Skip to content

Commit f490600

Browse files
committed
ENH: read_excel: try converting numeric to int
All Excel numeric data is stored as floats, so integral values have to be converted to int explicitly. Changing the default because it's surprising to save something with what looks like a row/column of integers and get a column of floats back instead (especially because it can lead to annoying Float64Indexes).
1 parent e893f43 commit f490600

File tree

4 files changed

+95
-30
lines changed

4 files changed

+95
-30
lines changed

pandas/io/excel.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def read_excel(io, sheetname, **kwds):
8383
engine: string, default None
8484
If io is not a buffer or path, this must be set to identify io.
8585
Acceptable values are None or xlrd
86+
convert_float : boolean, default True
87+
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
88+
data will be read in as floats: Excel stores all numbers as floats
89+
internally.
8690
8791
Returns
8892
-------
@@ -142,7 +146,7 @@ def __init__(self, io, **kwds):
142146
def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
143147
index_col=None, parse_cols=None, parse_dates=False,
144148
date_parser=None, na_values=None, thousands=None, chunksize=None,
145-
**kwds):
149+
convert_float=True, **kwds):
146150
"""Read an Excel table into DataFrame
147151
148152
Parameters
@@ -172,6 +176,10 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
172176
NaN values are overridden, otherwise they're appended to
173177
verbose : boolean, default False
174178
Indicate number of NA values placed in non-numeric columns
179+
convert_float : boolean, default True
180+
convert integral floats to int (i.e., 1.0 --> 1). If False, all
181+
numeric data will be read in as floats: Excel stores all numbers as
182+
floats internally.
175183
176184
Returns
177185
-------
@@ -191,7 +199,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
191199
parse_dates=parse_dates,
192200
date_parser=date_parser, na_values=na_values,
193201
thousands=thousands, chunksize=chunksize,
194-
skip_footer=skip_footer, **kwds)
202+
skip_footer=skip_footer,
203+
convert_float=convert_float,
204+
**kwds)
195205

196206
def _should_parse(self, i, parse_cols):
197207

@@ -229,9 +239,11 @@ def _excel2num(x):
229239
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
230240
index_col=None, has_index_names=None, parse_cols=None,
231241
parse_dates=False, date_parser=None, na_values=None,
232-
thousands=None, chunksize=None, **kwds):
242+
thousands=None, chunksize=None, convert_float=True,
243+
**kwds):
233244
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
234-
XL_CELL_ERROR, XL_CELL_BOOLEAN)
245+
XL_CELL_ERROR, XL_CELL_BOOLEAN,
246+
XL_CELL_NUMBER)
235247

236248
datemode = self.book.datemode
237249
if isinstance(sheetname, compat.string_types):
@@ -260,6 +272,12 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
260272
value = np.nan
261273
elif typ == XL_CELL_BOOLEAN:
262274
value = bool(value)
275+
elif convert_float and typ == XL_CELL_NUMBER:
276+
# Excel 'numbers' are always floats
277+
val = int(value)
278+
if val == value:
279+
value = val
280+
263281
row.append(value)
264282

265283
data.append(row)

pandas/io/tests/data/test_types.xls

25.5 KB
Binary file not shown.

pandas/io/tests/data/test_types.xlsx

32.9 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+73-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# pylint: disable=E1101
22

33
from pandas.compat import u, range, map
4+
from datetime import datetime
45
import os
56
import unittest
67

@@ -306,6 +307,54 @@ def test_reader_closes_file(self):
306307

307308
self.assertTrue(f.closed)
308309

310+
def test_reader_special_dtypes(self):
311+
_skip_if_no_xlrd()
312+
313+
expected = DataFrame.from_items([
314+
("IntCol", [1, 2, -3, 4, 0]),
315+
("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
316+
("BoolCol", [True, False, True, True, False]),
317+
("StrCol", [1, 2, 3, 4, 5]),
318+
("Str2Col", ["a", "b", "c", "d", "e"]),
319+
("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
320+
datetime(1905, 1, 1), datetime(2013, 12, 14),
321+
datetime(2015, 3, 14)])
322+
])
323+
324+
xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
325+
xls_path = os.path.join(self.dirpath, 'test_types.xls')
326+
327+
# should read in correctly and infer types
328+
for path in (xls_path, xlsx_path):
329+
actual = read_excel(path, 'Sheet1')
330+
tm.assert_frame_equal(actual, expected)
331+
332+
# if not coercing number, then int comes in as float
333+
float_expected = expected.copy()
334+
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
335+
for path in (xls_path, xlsx_path):
336+
actual = read_excel(path, 'Sheet1', convert_float=False)
337+
tm.assert_frame_equal(actual, float_expected)
338+
339+
# check setting Index (assuming xls and xlsx are the same here)
340+
for icol, name in enumerate(expected.columns):
341+
actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
342+
actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
343+
exp = expected.set_index(name)
344+
tm.assert_frame_equal(actual, exp)
345+
tm.assert_frame_equal(actual2, exp)
346+
347+
# convert_float and converters should be different but both accepted
348+
expected["StrCol"] = expected["StrCol"].apply(str)
349+
actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
350+
tm.assert_frame_equal(actual, expected)
351+
352+
no_convert_float = float_expected.copy()
353+
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
354+
actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
355+
convert_float=False)
356+
tm.assert_frame_equal(actual, no_convert_float)
357+
309358

310359
class ExcelWriterBase(SharedItems):
311360
# Base class for test cases to run with different Excel writers.
@@ -390,7 +439,7 @@ def test_roundtrip(self):
390439
tm.assert_frame_equal(self.frame, recons)
391440

392441
self.frame.to_excel(path, 'test1', na_rep='88')
393-
recons = read_excel(path, 'test1', index_col=0, na_values=[88,88.0])
442+
recons = read_excel(path, 'test1', index_col=0, na_values=[88, 88.0])
394443
tm.assert_frame_equal(self.frame, recons)
395444

396445
def test_mixed(self):
@@ -417,6 +466,16 @@ def test_tsframe(self):
417466
recons = reader.parse('test1')
418467
tm.assert_frame_equal(df, recons)
419468

469+
def test_basics_with_nan(self):
470+
_skip_if_no_xlrd()
471+
ext = self.ext
472+
path = '__tmp_to_excel_from_excel_int_types__.' + ext
473+
self.frame['A'][:5] = nan
474+
self.frame.to_excel(path, 'test1')
475+
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
476+
self.frame.to_excel(path, 'test1', header=False)
477+
self.frame.to_excel(path, 'test1', index=False)
478+
420479
def test_int_types(self):
421480
_skip_if_no_xlrd()
422481
ext = self.ext
@@ -425,20 +484,22 @@ def test_int_types(self):
425484
for np_type in (np.int8, np.int16, np.int32, np.int64):
426485

427486
with ensure_clean(path) as path:
428-
self.frame['A'][:5] = nan
429-
430-
self.frame.to_excel(path, 'test1')
431-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
432-
self.frame.to_excel(path, 'test1', header=False)
433-
self.frame.to_excel(path, 'test1', index=False)
434-
435-
# Test np.int values read come back as float.
487+
# Test np.int values read come back as int (rather than float
488+
# which is Excel's format).
436489
frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
437490
dtype=np_type)
438491
frame.to_excel(path, 'test1')
439492
reader = ExcelFile(path)
440-
recons = reader.parse('test1').astype(np_type)
441-
tm.assert_frame_equal(frame, recons, check_dtype=False)
493+
recons = reader.parse('test1')
494+
int_frame = frame.astype(int)
495+
tm.assert_frame_equal(int_frame, recons)
496+
recons2 = read_excel(path, 'test1')
497+
tm.assert_frame_equal(int_frame, recons2)
498+
499+
# test with convert_float=False comes back as float
500+
float_frame = frame.astype(float)
501+
recons = read_excel(path, 'test1', convert_float=False)
502+
tm.assert_frame_equal(recons, float_frame)
442503

443504
def test_float_types(self):
444505
_skip_if_no_xlrd()
@@ -447,13 +508,6 @@ def test_float_types(self):
447508

448509
for np_type in (np.float16, np.float32, np.float64):
449510
with ensure_clean(path) as path:
450-
self.frame['A'][:5] = nan
451-
452-
self.frame.to_excel(path, 'test1')
453-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
454-
self.frame.to_excel(path, 'test1', header=False)
455-
self.frame.to_excel(path, 'test1', index=False)
456-
457511
# Test np.float values read come back as float.
458512
frame = DataFrame(np.random.random_sample(10), dtype=np_type)
459513
frame.to_excel(path, 'test1')
@@ -468,13 +522,6 @@ def test_bool_types(self):
468522

469523
for np_type in (np.bool8, np.bool_):
470524
with ensure_clean(path) as path:
471-
self.frame['A'][:5] = nan
472-
473-
self.frame.to_excel(path, 'test1')
474-
self.frame.to_excel(path, 'test1', cols=['A', 'B'])
475-
self.frame.to_excel(path, 'test1', header=False)
476-
self.frame.to_excel(path, 'test1', index=False)
477-
478525
# Test np.bool values read come back as float.
479526
frame = (DataFrame([1, 0, True, False], dtype=np_type))
480527
frame.to_excel(path, 'test1')
@@ -1007,11 +1054,11 @@ def test_ExcelWriter_dispatch(self):
10071054
writer = ExcelWriter('apple.xls')
10081055
tm.assert_isinstance(writer, _XlwtWriter)
10091056

1010-
10111057
def test_register_writer(self):
10121058
# some awkward mocking to test out dispatch and such actually works
10131059
called_save = []
10141060
called_write_cells = []
1061+
10151062
class DummyClass(ExcelWriter):
10161063
called_save = False
10171064
called_write_cells = False

0 commit comments

Comments
 (0)