now sectionwise: date_converter: excel / date_parser pandas-dev#4332

timmie · timmie · commit 8f417acea87a · 2013-08-22T03:39:47.000+02:00
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -127,15 +127,18 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
         skipfooter = kwds.pop('skipfooter', None)
         if skipfooter is not None:
             skip_footer = skipfooter
-
-        return self._parse_excel(sheetname, header=header, skiprows=skiprows,
+        
+        # this now gives back a df
+        res = self._parse_excel(sheetname, header=header, skiprows=skiprows,
                                  index_col=index_col,
                                  has_index_names=has_index_names,
                                  parse_cols=parse_cols,
                                  parse_dates=parse_dates,
                                  date_parser=date_parser, na_values=na_values,
                                  thousands=thousands, chunksize=chunksize,
                                  skip_footer=skip_footer, **kwds)
+                    
+        return res
 
     def _should_parse(self, i, parse_cols):
 
@@ -195,11 +198,24 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                 if parse_cols is None or should_parse[j]:
                     if typ == XL_CELL_DATE:
                         dt = xldate_as_tuple(value, datemode)
+                        
                         # how to produce this first case?
+                        # if the year is ZERO then values are time/hours
                         if dt[0] < datetime.MINYEAR:  # pragma: no cover
-                            value = datetime.time(*dt[3:])
+                            datemode = 1
+                            dt = xldate_as_tuple(value, datemode)
+                            
+                            value = datetime.time(*dt[3:])  
+                                     
+
+                        #or insert a full date
                         else:
                             value = datetime.datetime(*dt)
+                        
+                        # apply the caller-supplied date_parser, if one was given
+                        if date_parser:
+                                value = date_parser(value)    
+                            
                     elif typ == XL_CELL_ERROR:
                         value = np.nan
                     elif typ == XL_CELL_BOOLEAN:
@@ -221,8 +237,15 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                             skip_footer=skip_footer,
                             chunksize=chunksize,
                             **kwds)
+        res = parser.read() 
+        
+        if header is not None:
+
+            if len(data[header]) == len(res.columns.tolist()):
+                res.columns = data[header]
+        
 
-        return parser.read()
+        return res
 
     @property
     def sheet_names(self):
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -1,7 +1,7 @@
 # pylint: disable=E1101
 
 from pandas.compat import StringIO, BytesIO, PY3, u, range, map
-from datetime import datetime
+#from datetime import datetime
 from os.path import split as psplit
 import csv
 import os
@@ -14,7 +14,7 @@
 from numpy import nan
 import numpy as np
 
-from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex, datetime
 import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                 TextParser, TextFileReader)
@@ -66,6 +66,78 @@ def _skip_if_no_excelsuite():
     _skip_if_no_openpyxl()
 
 
+def _skip_if_no_mpl():
+    '''pandas.tseries.converter imports matplotlib'''
+    try:
+        import matplotlib
+    except ImportError:
+        raise nose.SkipTest('matplotlib not installed, skipping')
+
+
+def _offset_time(value, offset=-10):
+    '''apply corrective time offset in minutes
+
+    input
+    -----
+    value : datetime.time
+    offset : integer value in minutes
+    '''
+    # an Excel time like '23.07.2013 24:00' actually means
+    # '23.07.2013 23:59' in Python, so it must be converted
+#            offset = -10 # minutes
+    _skip_if_no_mpl()
+    from pandas.io.date_converters import offset_datetime
+    ti_corr = offset_datetime(value, minutes=offset)
+    # combine the corrected time component with the datetime
+#            dt_comb = dt.datetime.combine(dt_now, ti_corr)
+
+    #since input is time, we return it.
+    #TODO:
+    #it is actually very strange that pandas treats an index of
+    #datetime.time values as an index of objects and not of times
+
+    return ti_corr
+
+
+def _correct_date_time(value):
+    '''corrects the times in the Excel test file to Python time
+    '''
+    _skip_if_no_xlrd()
+    _skip_if_no_mpl()
+    from pandas.io.date_converters import dt2ti
+
+    # an Excel time like '24:00' is converted to '23.07.2013 00:00';
+    # here, we just want the time component,
+    # since all inputs shall be equal
+    value = dt2ti(value)
+
+    #apply offset
+    value = _offset_time(value)
+
+    return value
+
+
+def read_excel_cell(filename):
+    '''read the excel cells into a dt object'''
+    _skip_if_no_xlrd()
+    # NameError: global name 'xlrd' is not defined
+    from xlrd import open_workbook, xldate_as_tuple
+    import datetime as dt
+    wb = open_workbook(filename)
+    sh = wb.sheet_by_name('min')
+    #get first time stamp
+    #TODO: the start row is: 12
+    ti_start = xldate_as_tuple(sh.row(12)[1].value, 1)
+    #get last time stamp
+    ti_end = xldate_as_tuple(sh.row(155)[1].value, 1)
+
+    #as datetime.time objects
+    ti_start = dt.time(*ti_start[3:])
+    ti_end = dt.time(*ti_end[3:])
+
+    return (ti_start, ti_end)
+
+
 _seriesd = tm.getSeriesData()
 _tsd = tm.getTimeSeriesData()
 _frame = DataFrame(_seriesd)[:10]
@@ -295,6 +367,75 @@ def test_xlsx_table(self):
         tm.assert_frame_equal(df4, df.ix[:-1])
         tm.assert_frame_equal(df4, df5)
 
+    def test_xlsx_table_hours(self):
+        #check that the hours are read correctly
+        _skip_if_no_xlrd()
+        _skip_if_no_openpyxl()
+        _skip_if_no_mpl()
+        import datetime as dt
+
+
+
+        # 1900 datemode file
+        filename = 'example_file_2013-07-25.xlsx'
+        pth = os.path.join(self.dirpath, filename)
+        xlsx = ExcelFile(pth)
+        # parse_dates=False is necessary to obtain right sorting of rows in df
+        # TODO: this must actually be skiprows=11, header=10
+#        df =xlsx.parse('min', skiprows=12, header=10, index_col=1,
+#                         parse_dates=False, date_parser=correct_date_time)
+        df =xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                         parse_dates=False, date_parser=_correct_date_time)
+
+        df_start = df.index[0]
+        df_end = df.index[-1:]
+        # test: are the first/last index equal to the cells read in directly by xlrd
+        excel_cells = read_excel_cell(pth)
+
+        xl_start = _offset_time(excel_cells[0])
+        xl_end = _offset_time(excel_cells[1])
+
+        self.assertEqual(df_start, xl_start)
+        self.assertEqual(df_end, xl_end)
+
+        #test Excel 1904 datemode
+        filename_1904 = 'example_file_2013-07-25_1904-dates.xlsx'
+        pth = os.path.join(self.dirpath, filename_1904)
+        xlsx = ExcelFile(pth)
+        # parse_dates=False is necessary to obtain right sorting of rows in df
+        # TODO: this must actually be skiprows=11
+        df =xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                         parse_dates=False, date_parser=_correct_date_time)
+
+        df_start = df.index[0]
+        df_end = df.index[-1:]
+
+        excel_cells = read_excel_cell(pth)
+        xl_start = _offset_time(excel_cells[0])
+        xl_end = _offset_time(excel_cells[1])
+
+        # test: are the first/last index equal to the cells read in directly
+        self.assertEqual(df_start, xl_start)
+        self.assertEqual(df_end, xl_end)
+
+        # test if a produced datetime is equal to a datetime directly produced by xlrd
+        daydt_str = filename.split('.')[0][-10:]
+        daydt = dt.datetime.strptime(daydt_str, '%Y-%m-%d')
+#
+        df['date'] = daydt
+        df['time'] = df.index
+
+        #TODO review this
+#        df['datetime'] = df.apply(lambda x: pd.datetime.combine(x['date'], x['time'], axis=1))
+
+#        df.set_index(['datetime'])
+#        import datetime as dt
+#        dt_test = dt.datetime.combine(daydt, excel_cells[1])
+
+#        pdt_test = df.index[-1]
+
+#        self.assertEqual(dt_test, pdt_test)
+
     def test_specify_kind_xls(self):
         _skip_if_no_xlrd()
         xlsx_file = os.path.join(self.dirpath, 'test.xlsx')