Skip to content

Commit 8f417ac

Browse files
committed
now sectionwise: date_converter: excel / date_parser pandas-dev#4332
1 parent dc3ead3 commit 8f417ac

File tree

2 files changed

+170
-6
lines changed

2 files changed

+170
-6
lines changed

pandas/io/excel.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -127,15 +127,18 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
127127
skipfooter = kwds.pop('skipfooter', None)
128128
if skipfooter is not None:
129129
skip_footer = skipfooter
130-
131-
return self._parse_excel(sheetname, header=header, skiprows=skiprows,
130+
131+
# this now gives back a df
132+
res = self._parse_excel(sheetname, header=header, skiprows=skiprows,
132133
index_col=index_col,
133134
has_index_names=has_index_names,
134135
parse_cols=parse_cols,
135136
parse_dates=parse_dates,
136137
date_parser=date_parser, na_values=na_values,
137138
thousands=thousands, chunksize=chunksize,
138139
skip_footer=skip_footer, **kwds)
140+
141+
return res
139142

140143
def _should_parse(self, i, parse_cols):
141144

@@ -195,11 +198,24 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
195198
if parse_cols is None or should_parse[j]:
196199
if typ == XL_CELL_DATE:
197200
dt = xldate_as_tuple(value, datemode)
201+
198202
# how to produce this first case?
203+
# if the year is ZERO then values are time/hours
199204
if dt[0] < datetime.MINYEAR: # pragma: no cover
200-
value = datetime.time(*dt[3:])
205+
datemode = 1
206+
dt = xldate_as_tuple(value, datemode)
207+
208+
value = datetime.time(*dt[3:])
209+
210+
211+
#or insert a full date
201212
else:
202213
value = datetime.datetime(*dt)
214+
215+
#apply eventual date_parser correction
216+
if date_parser:
217+
value = date_parser(value)
218+
203219
elif typ == XL_CELL_ERROR:
204220
value = np.nan
205221
elif typ == XL_CELL_BOOLEAN:
@@ -221,8 +237,15 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
221237
skip_footer=skip_footer,
222238
chunksize=chunksize,
223239
**kwds)
240+
res = parser.read()
241+
242+
if header is not None:
243+
244+
if len(data[header]) == len(res.columns.tolist()):
245+
res.columns = data[header]
246+
224247

225-
return parser.read()
248+
return res
226249

227250
@property
228251
def sheet_names(self):

pandas/io/tests/test_excel.py

+143-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# pylint: disable=E1101
22

33
from pandas.compat import StringIO, BytesIO, PY3, u, range, map
4-
from datetime import datetime
4+
#from datetime import datetime
55
from os.path import split as psplit
66
import csv
77
import os
@@ -14,7 +14,7 @@
1414
from numpy import nan
1515
import numpy as np
1616

17-
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
17+
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex, datetime
1818
import pandas.io.parsers as parsers
1919
from pandas.io.parsers import (read_csv, read_table, read_fwf,
2020
TextParser, TextFileReader)
@@ -66,6 +66,78 @@ def _skip_if_no_excelsuite():
6666
_skip_if_no_openpyxl()
6767

6868

69+
def _skip_if_no_mpl():
    '''Skip the calling test when matplotlib cannot be imported.

    Needed because pandas.tseries.converter imports matplotlib.
    '''
    try:
        # only the importability matters; the module object is not used
        __import__('matplotlib')
    except ImportError:
        raise nose.SkipTest('matplotlib not installed, skipping')
75+
76+
77+
def _offset_time(value, offset=-10):
    '''Apply a corrective time offset, given in minutes.

    input
    -----
    value : datetime.time
    offset : integer value in minutes (default: -10)

    Returns whatever ``offset_datetime(value, minutes=offset)`` produces.
    '''
    # An Excel time such as '23.07.2013 24:00' actually means
    # '23.07.2013 23:59' in Python terms, so it must be converted.
    _skip_if_no_mpl()
    from pandas.io.date_converters import offset_datetime

    # The input is a time, so the corrected time is handed back unchanged
    # (it is not recombined with a date).
    # TODO: it is actually very strange that pandas considers an index of
    # datetime.time values to be an index of objects and not of times.
    return offset_datetime(value, minutes=offset)
100+
101+
102+
def _correct_date_time(value):
    '''Correct the times in the Excel test file to Python time.

    The value is first passed through ``dt2ti`` and the result is then
    shifted by ``_offset_time`` with its default offset.
    '''
    _skip_if_no_xlrd()
    _skip_if_no_mpl()
    from pandas.io.date_converters import dt2ti

    # An Excel time like '24:00' gets converted to '23.07.2013 00:00';
    # only the time component is wanted here so that all inputs are
    # comparable, after which the offset is applied.
    return _offset_time(dt2ti(value))
118+
119+
120+
def read_excel_cell(filename):
    '''Read the first and last time cells of sheet 'min' with xlrd.

    Returns a tuple ``(ti_start, ti_end)`` of ``datetime.time`` objects.
    '''
    _skip_if_no_xlrd()
    # imported locally; a module-level ``xlrd`` name is not defined here
    from xlrd import open_workbook, xldate_as_tuple
    import datetime as dt

    sheet = open_workbook(filename).sheet_by_name('min')

    def _time_at(rowx):
        # convert the cell in column 1 using datemode 1, then keep only
        # the (hour, minute, second) part of the resulting tuple
        stamp = xldate_as_tuple(sheet.row(rowx)[1].value, 1)
        return dt.time(*stamp[3:])

    # TODO: the start row is: 12
    return (_time_at(12), _time_at(155))
139+
140+
69141
_seriesd = tm.getSeriesData()
70142
_tsd = tm.getTimeSeriesData()
71143
_frame = DataFrame(_seriesd)[:10]
@@ -295,6 +367,75 @@ def test_xlsx_table(self):
295367
tm.assert_frame_equal(df4, df.ix[:-1])
296368
tm.assert_frame_equal(df4, df5)
297369

370+
def test_xlsx_table_hours(self):
    '''Check that hour-only Excel cells are read in correctly.

    For both the 1900 and the 1904 datemode example files, the first and
    last index labels of the parsed sheet 'min' must equal the matching
    cells read directly through xlrd, once the same time offset has been
    applied to them.
    '''
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()
    _skip_if_no_mpl()
    import datetime as dt

    filename = 'example_file_2013-07-25.xlsx'  # 1900 datemode file
    filename_1904 = 'example_file_2013-07-25_1904-dates.xlsx'

    for name in (filename, filename_1904):
        pth = os.path.join(self.dirpath, name)
        xlsx = ExcelFile(pth)
        # parse_dates=False is necessary to obtain the right sorting of
        # rows in df
        # TODO: this must actually be skiprows=11, header=10
        df = xlsx.parse('min', skiprows=12, header=10, index_col=1,
                        parse_dates=False, date_parser=_correct_date_time)

        # Take scalar labels: ``df.index[-1:]`` would be a one-element
        # Index, not the last label itself.
        df_start = df.index[0]
        df_end = df.index[-1]

        # the same cells, read in directly by xlrd
        excel_cells = read_excel_cell(pth)
        xl_start = _offset_time(excel_cells[0])
        xl_end = _offset_time(excel_cells[1])

        # test: are the first/last index equal to the cells read directly
        self.assertEqual(df_start, xl_start)
        self.assertEqual(df_end, xl_end)

    # Preparation for comparing a combined datetime with one produced
    # directly by xlrd; the day is taken from the first file's name and
    # the columns are added to the last parsed (1904) frame.
    daydt_str = filename.split('.')[0][-10:]
    daydt = dt.datetime.strptime(daydt_str, '%Y-%m-%d')
    df['date'] = daydt
    df['time'] = df.index

    # TODO review this: combine 'date' and 'time' into a 'datetime'
    # column/index and assert that its last entry equals
    # dt.datetime.combine(daydt, excel_cells[1]).
438+
298439
def test_specify_kind_xls(self):
299440
_skip_if_no_xlrd()
300441
xlsx_file = os.path.join(self.dirpath, 'test.xlsx')

0 commit comments

Comments
 (0)