Skip to content

BUG: Fix to read decimal seconds from Excel. #6934

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 24, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ Improvements to existing features
- Translate ``sep='\s+'`` to ``delim_whitespace=True`` in
:func:`read_csv`/:func:`read_table` if no other C-unsupported options
specified (:issue:`6607`)
- ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)

.. _release.bug_fixes-0.14.0:

Expand Down
45 changes: 36 additions & 9 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pandas.compat as compat
import pandas.core.common as com
from warnings import warn
from distutils.version import LooseVersion

__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]

Expand Down Expand Up @@ -250,11 +251,19 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
parse_dates=False, date_parser=None, na_values=None,
thousands=None, chunksize=None, convert_float=True,
**kwds):
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
import xlrd
from xlrd import (xldate, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)

datemode = self.book.datemode
epoch1904 = self.book.datemode

# xlrd >= 0.9.3 can return datetime objects directly.
if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
xlrd_0_9_3 = True
else:
xlrd_0_9_3 = False

if isinstance(sheetname, compat.string_types):
sheet = self.book.sheet_by_name(sheetname)
else: # assume an integer if not a string
Expand All @@ -271,12 +280,29 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,

if parse_cols is None or should_parse[j]:
if typ == XL_CELL_DATE:
dt = xldate_as_tuple(value, datemode)
# how to produce this first case?
if dt[0] < datetime.MINYEAR: # pragma: no cover
value = datetime.time(*dt[3:])
if xlrd_0_9_3:
# Use the newer xlrd datetime handling.
value = xldate.xldate_as_datetime(value, epoch1904)

# Excel doesn't distinguish between dates and time,
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (value.timetuple())[0:3]
if ((not epoch1904 and year == (1899, 12, 31))
or (epoch1904 and year == (1904, 1, 1))):
value = datetime.time(value.hour,
value.minute,
value.second,
value.microsecond)
else:
value = datetime.datetime(*dt)
# Use the xlrd <= 0.9.2 date handling.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this works; it just doesn't handle decimal seconds, right? Is it appropriate to raise if you detect that? (Can you even detect it?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, correct. Previous versions of xlrd rounded the milliseconds to the nearest whole second. It isn't possible to detect this (at least not easily), since all dates and times are stored as floats, with no distinction between them.

dt = xldate.xldate_as_tuple(value, epoch1904)

if dt[0] < datetime.MINYEAR:
value = datetime.time(*dt[3:])
else:
value = datetime.datetime(*dt)

elif typ == XL_CELL_ERROR:
value = np.nan
elif typ == XL_CELL_BOOLEAN:
Expand Down Expand Up @@ -727,8 +753,9 @@ def __init__(self, path, engine=None,
import xlsxwriter

super(_XlsxWriter, self).__init__(path, engine=engine,
date_format=date_format, datetime_format=datetime_format,
**engine_kwargs)
date_format=date_format,
datetime_format=datetime_format,
**engine_kwargs)

self.book = xlsxwriter.Workbook(path, **engine_kwargs)

Expand Down
Binary file added pandas/io/tests/data/times_1900.xls
Binary file not shown.
Binary file added pandas/io/tests/data/times_1904.xls
Binary file not shown.
54 changes: 49 additions & 5 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# pylint: disable=E1101

from pandas.compat import u, range, map
from datetime import datetime, date
from datetime import datetime, date, time
import os
from distutils.version import LooseVersion

import nose

Expand Down Expand Up @@ -360,6 +361,49 @@ def test_reader_special_dtypes(self):
convert_float=False)
tm.assert_frame_equal(actual, no_convert_float)

def test_reader_seconds(self):
    # GH5945: times stored in Excel with (and without) fractional seconds
    # must be read back as datetime.time values, for both the 1900 and the
    # 1904 workbook epochs.
    _skip_if_no_xlrd()
    import xlrd

    keeps_milliseconds = (LooseVersion(xlrd.__VERSION__) >=
                          LooseVersion("0.9.3"))

    if keeps_milliseconds:
        # xlrd >= 0.9.3 preserves the sub-second part of each time.
        times = [time(1, 2, 3),
                 time(2, 45, 56, 100000),
                 time(4, 29, 49, 200000),
                 time(6, 13, 42, 300000),
                 time(7, 57, 35, 400000),
                 time(9, 41, 28, 500000),
                 time(11, 25, 21, 600000),
                 time(13, 9, 14, 700000),
                 time(14, 53, 7, 800000),
                 time(16, 37, 0, 900000),
                 time(18, 20, 54)]
    else:
        # Older xlrd rounds the fractional part into whole seconds.
        times = [time(1, 2, 3),
                 time(2, 45, 56),
                 time(4, 29, 49),
                 time(6, 13, 42),
                 time(7, 57, 35),
                 time(9, 41, 29),
                 time(11, 25, 22),
                 time(13, 9, 15),
                 time(14, 53, 8),
                 time(16, 37, 1),
                 time(18, 20, 54)]

    expected = DataFrame.from_items([("Time", times)])

    # The same expected frame applies to both epoch fixtures.
    for workbook in ('times_1900.xls', 'times_1904.xls'):
        path = os.path.join(self.dirpath, workbook)
        actual = read_excel(path, 'Sheet1')
        tm.assert_frame_equal(actual, expected)


class ExcelWriterBase(SharedItems):
# Base class for test cases to run with different Excel writers.
Expand Down Expand Up @@ -400,7 +444,7 @@ def test_excel_deprecated_options(self):
with ensure_clean(self.ext) as path:
with tm.assert_produces_warning(FutureWarning):
self.frame.to_excel(path, 'test1', cols=['A', 'B'])

with tm.assert_produces_warning(False):
self.frame.to_excel(path, 'test1', columns=['A', 'B'])

Expand Down Expand Up @@ -832,9 +876,9 @@ def test_to_excel_output_encoding(self):
index=[u('A\u0192'), 'B'], columns=[u('X\u0193'), 'Y', 'Z'])

with ensure_clean(filename) as filename:
df.to_excel(filename, sheet_name = 'TestSheet', encoding='utf8')
result = read_excel(filename, 'TestSheet', encoding = 'utf8')
tm.assert_frame_equal(result,df)
df.to_excel(filename, sheet_name='TestSheet', encoding='utf8')
result = read_excel(filename, 'TestSheet', encoding='utf8')
tm.assert_frame_equal(result, df)


def test_to_excel_unicode_filename(self):
Expand Down