Skip to content

Commit 1ea301e

Browse files
committed
TST: Test for float --> int in the middle of object dtype
1 parent f490600 commit 1ea301e

File tree

6 files changed

+14
-2
lines changed

6 files changed

+14
-2
lines changed

doc/source/io.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
18391839
df1.to_excel(writer, sheet_name='Sheet1')
18401840
df2.to_excel(writer, sheet_name='Sheet2')
18411841
1842+
.. note:: Wringing a little more performance out of ``read_excel``
1843+
Internally, Excel stores all numeric data as floats. Because this can
1844+
produce unexpected behavior when reading in data, pandas defaults to trying
1845+
to convert integral floats to integers if it doesn't lose information (``1.0 -->
1846+
1``). You can pass ``convert_float=False`` to disable this behavior, which
1847+
may give a slight performance improvement.
1848+
18421849
.. _io.excel.writers:
18431850

18441851
Excel writer engines

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@ Improvements to existing features
207207
closed])
208208
- Fixed bug in `tools.plotting.andrews_curves` so that lines are drawn grouped
209209
by color as expected.
210+
- ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
211+
by default. (:issue:`5394`)
210212

211213
API Changes
212214
~~~~~~~~~~~

pandas/io/excel.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,8 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
273273
elif typ == XL_CELL_BOOLEAN:
274274
value = bool(value)
275275
elif convert_float and typ == XL_CELL_NUMBER:
276-
# Excel 'numbers' are always floats
276+
# GH5394 - Excel 'numbers' are always floats
277+
# it's a minimal perf hit and less surprising
277278
val = int(value)
278279
if val == value:
279280
value = val

pandas/io/tests/data/test_types.xls

0 Bytes
Binary file not shown.

pandas/io/tests/data/test_types.xlsx

47 Bytes
Binary file not shown.

pandas/io/tests/test_excel.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,8 @@ def test_reader_special_dtypes(self):
315315
("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
316316
("BoolCol", [True, False, True, True, False]),
317317
("StrCol", [1, 2, 3, 4, 5]),
318-
("Str2Col", ["a", "b", "c", "d", "e"]),
318+
# GH5394 - this is why convert_float isn't vectorized
319+
("Str2Col", ["a", 3, "c", "d", "e"]),
319320
("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
320321
datetime(1905, 1, 1), datetime(2013, 12, 14),
321322
datetime(2015, 3, 14)])
@@ -332,6 +333,7 @@ def test_reader_special_dtypes(self):
332333
# if not coercing number, then int comes in as float
333334
float_expected = expected.copy()
334335
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
336+
float_expected.loc[1, "Str2Col"] = 3.0
335337
for path in (xls_path, xlsx_path):
336338
actual = read_excel(path, 'Sheet1', convert_float=False)
337339
tm.assert_frame_equal(actual, float_expected)

0 commit comments

Comments
 (0)