diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py
index 2be477f49e28b..3aa9955a6affe 100644
--- a/pandas/io/date_converters.py
+++ b/pandas/io/date_converters.py
@@ -1,4 +1,6 @@
 """This module is designed for community supported date conversion functions"""
+from datetime import datetime, timedelta, time
+
 from pandas.compat import range
 import numpy as np
 import pandas.lib as lib
@@ -56,3 +58,64 @@ def _check_columns(cols):
             raise AssertionError()
 
     return N
+
+
+## Datetime conversion helpers for date_parsers
+## see also: create a community supported set of typical converters
+## https://github.com/pydata/pandas/issues/1180
+
+def offset_datetime(dt_in, days=0, hours=0, minutes=0,
+                    seconds=0, microseconds=0):
+    '''Apply a corrective time offset using datetime.timedelta.
+
+    Example: an Excel time written as '23.07.2013 24:00' that is meant
+    to be '23.07.2013 23:59' in Python can be shifted back with minutes=-1.
+
+    input
+    -----
+    dt_in : datetime.time or datetime.datetime object
+    days : integer value (positive or negative) for days component of offset
+    hours : integer value (positive or negative) for hours component of offset
+    minutes : integer value (positive or negative) for
+              minutes component of offset
+    seconds : integer value (positive or negative) for
+              seconds component of offset
+    microseconds : integer value (positive or negative) for
+                   microseconds component of offset
+
+    output
+    ------
+    dt_corr : dt_in shifted by the offset; a datetime.time input gives a
+              datetime.time back (wrapping around midnight)
+    '''
+    delta = timedelta(days=days, hours=hours, minutes=minutes,
+                      seconds=seconds, microseconds=microseconds)
+
+    # datetime is not a subclass of time, so only pure times match here
+    if isinstance(dt_in, time):
+        # time objects do not support arithmetic: attach the time to a
+        # fixed dummy date, shift it and keep only the time component
+        dt_base = datetime.combine(datetime(2000, 1, 1), dt_in)
+        return (dt_base + delta).time()
+
+    return dt_in + delta
+
+
+def dt2ti(dt_in):
+    '''Reduce a datetime.datetime to its datetime.time component.
+
+    input
+    -----
+    dt_in : datetime.time or datetime.datetime object
+
+    output
+    ------
+    ti_corr : datetime.time object (dt_in itself if it already is a time)
+    '''
+    # important hint:
+    # http://stackoverflow.com/a/12906456
+    if isinstance(dt_in, time):
+        return dt_in
+
+    # everything else is expected to offer .time(), like datetime.datetime
+    return dt_in.time()
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index 534a88e303dbf..588450cb4c11f 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -127,8 +127,9 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
         skipfooter = kwds.pop('skipfooter', None)
         if skipfooter is not None:
             skip_footer = skipfooter
-
-        return self._parse_excel(sheetname, header=header, skiprows=skiprows,
+
+        # this now gives back a df
+        res = self._parse_excel(sheetname, header=header, skiprows=skiprows,
                                  index_col=index_col,
                                  has_index_names=has_index_names,
                                  parse_cols=parse_cols,
@@ -136,6 +137,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
                                  date_parser=date_parser, na_values=na_values,
                                  thousands=thousands, chunksize=chunksize,
                                  skip_footer=skip_footer, **kwds)
+
+        return res
 
     def _should_parse(self, i, parse_cols):
 
@@ -195,11 +198,24 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                 if parse_cols is None or should_parse[j]:
                     if typ == XL_CELL_DATE:
                         dt = xldate_as_tuple(value, datemode)
+
                         # how to produce this first case?
+                        # if the year is ZERO then values are time/hours
                         if dt[0] < datetime.MINYEAR:  # pragma: no cover
+                            # xlrd fills hour/minute/second the same way for
+                            # either datemode when the day part is zero, so
+                            # dt is used as it is; the workbook datemode must
+                            # stay untouched for the date cells that follow
                             value = datetime.time(*dt[3:])
+
+                        # or insert a full date
                         else:
                             value = datetime.datetime(*dt)
+
+                        # apply eventual date_parser correction
+                        if date_parser:
+                            value = date_parser(value)
+
                     elif typ == XL_CELL_ERROR:
                         value = np.nan
                     elif typ == XL_CELL_BOOLEAN:
@@ -221,8 +237,15 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                             skip_footer=skip_footer,
                             chunksize=chunksize,
                             **kwds)
+        res = parser.read()
+
+        if header is not None:
+            # take the raw header row as column labels, but only when it
+            # has exactly one entry per parsed column
+            if len(data[header]) == len(res.columns.tolist()):
+                res.columns = data[header]
 
-        return parser.read()
+        return res
 
     @property
     def sheet_names(self):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3b132be800cb1..9d0581f9094bc 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1150,7 +1150,11 @@ def TextParser(*args, **kwds):
         returns Series if only one column
     """
     kwds['engine'] = 'python'
-    return TextFileReader(*args, **kwds)
+
+    res = TextFileReader(*args, **kwds)
+
+
+    return res
 
 # delimiter=None, dialect=None, names=None, header=0,
 # index_col=None,
@@ -1385,6 +1389,7 @@ def _convert_data(self, data):
                                          clean_conv)
 
     def _infer_columns(self):
+        #TODO: this full part is too complex and somewhat strange!!!
         names = self.names
 
         if self.header is not None:
@@ -1396,13 +1401,20 @@ def _infer_columns(self):
                 header = list(header) + [header[-1]+1]
             else:
                 have_mi_columns = False
+                #TODO: explain why header (in this case 1 number) needs to be a list???
                 header = [ header ]
 
             columns = []
             for level, hr in enumerate(header):
-
+                #TODO: explain why self.buf is needed.
+                # the header is correctly retrieved in excel.py by
+                # data[header] = _trim_excel_header(data[header])
                 if len(self.buf) > 0:
                     line = self.buf[0]
+
+                elif (header[0] == hr) and (level == 0) and (header[0] > 0):
+                    line = self._get_header()
+
                 else:
                     line = self._next_line()
 
@@ -1456,8 +1468,24 @@ def _infer_columns(self):
                 columns = [ names ]
 
         return columns
+
+    def _get_header(self):
+        ''' reads header if e.g. header
+        FIXME: this should be turned into something much less complicated
+        FIXME: all due to the header assuming that there is never a row between
+        data and header
+        '''
+        if isinstance(self.data, list):
+            line = self.data[self.header]
+            self.pos = self.header +1
+        else:
+            line = self._next_line()
+
+        return line
 
     def _next_line(self):
+        #FIXME: why is self.data at times a list and sometimes a _csv.reader??
+        # reduce complexity here!!!
         if isinstance(self.data, list):
             while self.pos in self.skiprows:
                 self.pos += 1
diff --git a/pandas/io/test_date_converters.py b/pandas/io/test_date_converters.py
new file mode 100644
index 0000000000000..7bba0212b1348
--- /dev/null
+++ b/pandas/io/test_date_converters.py
@@ -0,0 +1,155 @@
+from pandas.compat import StringIO, BytesIO
+from datetime import datetime, time, timedelta, date
+import csv
+import os
+import sys
+import re
+import unittest
+
+import nose
+
+from numpy import nan
+import numpy as np
+from numpy.testing.decorators import slow
+
+from pandas import DataFrame, Series, Index, isnull
+import pandas.io.parsers as parsers
+from pandas.io.parsers import (read_csv, read_table, read_fwf,
+                               TextParser)
+from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
+                                 assert_series_equal, network)
+import pandas.lib as lib
+from pandas import compat
+from pandas.lib import Timestamp
+import pandas.io.date_converters as conv
+
+
+class TestConverters(unittest.TestCase):
+
+    def setUp(self):
+        self.years = np.array([2007, 2008])
+        self.months = np.array([1, 2])
+        self.days = np.array([3, 4])
+        self.hours = np.array([5, 6])
+        self.minutes = np.array([7, 8])
+        self.seconds = np.array([9, 0])
+        self.dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
+        self.times = np.array(['05:07:09', '06:08:00'], dtype=object)
+        self.expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
+                                  datetime(2008, 2, 4, 6, 8, 0)])
+
+    def test_parse_date_time(self):
+        result = conv.parse_date_time(self.dates, self.times)
+        self.assert_((result == self.expected).all())
+
+        data = """\
+date, time, a, b
+2001-01-05, 10:00:00, 0.0, 10.
+2001-01-05, 00:00:00, 1., 11.
+"""
+        datecols = {'date_time': [0, 1]}
+        df = read_table(StringIO(data), sep=',', header=0,
+                        parse_dates=datecols, date_parser=conv.parse_date_time)
+        self.assert_('date_time' in df)
+        self.assert_(df.date_time.ix[0] == datetime(2001, 1, 5, 10, 0, 0))
+
+        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
+                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
+                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
+                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
+                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
+                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
+
+        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
+        df = read_csv(StringIO(data), header=None, parse_dates=date_spec,
+                      date_parser=conv.parse_date_time)
+
+    def test_parse_date_fields(self):
+        result = conv.parse_date_fields(self.years, self.months, self.days)
+        expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
+        self.assert_((result == expected).all())
+
+        data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11."
+        datecols = {'ymd': [0, 1, 2]}
+        df = read_table(StringIO(data), sep=',', header=0,
+                        parse_dates=datecols,
+                        date_parser=conv.parse_date_fields)
+        self.assert_('ymd' in df)
+        self.assert_(df.ymd.ix[0] == datetime(2001, 1, 10))
+
+    def test_datetime_six_col(self):
+        result = conv.parse_all_fields(self.years, self.months, self.days,
+                                       self.hours, self.minutes, self.seconds)
+        self.assert_((result == self.expected).all())
+
+        data = """\
+year, month, day, hour, minute, second, a, b
+2001, 01, 05, 10, 00, 0, 0.0, 10.
+2001, 01, 5, 10, 0, 00, 1., 11.
+"""
+        datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
+        df = read_table(StringIO(data), sep=',', header=0,
+                        parse_dates=datecols,
+                        date_parser=conv.parse_all_fields)
+        self.assert_('ymdHMS' in df)
+        self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0))
+
+    def test_datetime_fractional_seconds(self):
+        data = """\
+year, month, day, hour, minute, second, a, b
+2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
+2001, 01, 5, 10, 0, 0.500000, 1., 11.
+"""
+        datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
+        df = read_table(StringIO(data), sep=',', header=0,
+                        parse_dates=datecols,
+                        date_parser=conv.parse_all_fields)
+        self.assert_('ymdHMS' in df)
+        self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0,
+                                                 microsecond=123456))
+        self.assert_(df.ymdHMS.ix[1] == datetime(2001, 1, 5, 10, 0, 0,
+                                                 microsecond=500000))
+
+    def test_generic(self):
+        data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
+        datecols = {'ym': [0, 1]}
+        dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
+        df = read_table(StringIO(data), sep=',', header=0,
+                        parse_dates=datecols,
+                        date_parser=dateconverter)
+        self.assert_('ym' in df)
+        self.assert_(df.ym.ix[0] == date(2001, 1, 1))
+
+    def test_offset_datetime(self):
+        #test with a datetime.datetime object
+        dt_in = datetime(2013, 1, 1, 1, 10, 10, 100000)
+        dt_target = datetime(2013, 1, 2, 6, 20, 40, 100600)
+        dt_res = conv.offset_datetime(dt_in, days=1, hours=5, minutes=10,
+                                      seconds=30, microseconds=600)
+
+        assert(dt_res == dt_target)
+        #test with a datetime.time object
+        ti_in = time(1, 10, 20, 100000)
+        ti_target = time(6, 20, 50, 100600)
+        ti_res = conv.offset_datetime(ti_in, hours=5, minutes=10,
+                                      seconds=30, microseconds=600)
+        assert(ti_res == ti_target)
+
+    def test_dt2ti(self):
+        #a datetime.datetime object
+        dt_in = datetime(2013, 1, 1, 1, 10, 10, 100000)
+        ti_target = time(1, 10, 10, 100000)
+        dt2ti_dt_res = conv.dt2ti(dt_in)
+        assert(ti_target == dt2ti_dt_res)
+
+        #a datetime.time object
+        ti_in = time(1, 10, 20, 100000)
+        ti_target_dt2ti = time(1, 10, 20, 100000)
+        dt2ti_ti_res = conv.dt2ti(ti_in)
+        assert(ti_target_dt2ti == dt2ti_ti_res)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/io/tests/data/example_file_2013-07-25.xlsx b/pandas/io/tests/data/example_file_2013-07-25.xlsx
new file mode 100644
index 0000000000000..0d7b4f9e8f227
Binary files /dev/null and b/pandas/io/tests/data/example_file_2013-07-25.xlsx differ
diff --git a/pandas/io/tests/data/example_file_2013-07-25_1904-dates.xlsx b/pandas/io/tests/data/example_file_2013-07-25_1904-dates.xlsx
new file mode 100644
index 0000000000000..5889593e62908
Binary files /dev/null and b/pandas/io/tests/data/example_file_2013-07-25_1904-dates.xlsx differ
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 3f41be6ae64c6..2074950aadbc5 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -1,7 +1,7 @@
 # pylint: disable=E1101
 
 from pandas.compat import StringIO, BytesIO, PY3, u, range, map
-from datetime import datetime
+#from datetime import datetime
 from os.path import split as psplit
 import csv
 import os
@@ -14,7 +14,7 @@
 from numpy import nan
 import numpy as np
 
-from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex, datetime
 import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextParser, TextFileReader)
@@ -66,6 +66,78 @@ def _skip_if_no_excelsuite():
     _skip_if_no_openpyxl()
 
 
+def _skip_if_no_mpl():
+    '''pandas.tseries.converter imports matplotlib'''
+    try:
+        import matplotlib
+    except ImportError:
+        raise nose.SkipTest('matplotlib not installed, skipping')
+
+
+def _offset_time(value, offset=-10):
+    '''apply corrective time offset in minutes
+
+    input
+    -----
+    value : datetime.time
+    offset : integer value in minutes
+    '''
+    # if a excel time like '23.07.2013 24:00' they actually mean
+    # in Python '23.07.2013 23:59', must be converted
+#    offset = -10 # minutes
+    _skip_if_no_mpl()
+    from pandas.io.date_converters import offset_datetime
+    ti_corr = offset_datetime(value, minutes=offset)
+    # combine the corrected time component with the datetime
+#    dt_comb = dt.datetime.combine(dt_now, ti_corr)
+
+    #since input is time, we return it.
+    #TODO:
+    #it is actually very strange that Pandas does consider an index
+    #of datetime.time as index of objects and not time
+
+    return ti_corr
+
+
+def _correct_date_time(value):
+    '''corrects the times in the Excel test file to Python time
+    '''
+    _skip_if_no_xlrd()
+    _skip_if_no_mpl()
+    from pandas.io.date_converters import dt2ti
+
+    # an Excel time like '24:00' arrives converted to '23.07.2013 00:00';
+    # here, we just want the time component,
+    # since all inputs shall be equal
+    value = dt2ti(value)
+
+    #apply offset
+    value = _offset_time(value)
+
+    return value
+
+
+def read_excel_cell(filename):
+    '''read the first and last time cell of sheet 'min' with xlrd'''
+    _skip_if_no_xlrd()
+    # NameError: global name 'xlrd' is not defined
+    from xlrd import open_workbook, xldate_as_tuple
+    import datetime as dt
+    wb = open_workbook(filename)
+    sh = wb.sheet_by_name('min')
+    #get first time stamp
+    #TODO: the start row is: 12
+    ti_start = xldate_as_tuple(sh.row(12)[1].value, 1)
+    #get last time stamp
+    ti_end = xldate_as_tuple(sh.row(155)[1].value, 1)
+
+    #as datetime.time
+    ti_start = dt.time(*ti_start[3:])
+    ti_end = dt.time(*ti_end[3:])
+
+    return (ti_start, ti_end)
+
+
 _seriesd = tm.getSeriesData()
 _tsd = tm.getTimeSeriesData()
 _frame = DataFrame(_seriesd)[:10]
@@ -295,6 +367,75 @@ def test_xlsx_table(self):
         tm.assert_frame_equal(df4, df.ix[:-1])
         tm.assert_frame_equal(df4, df5)
 
+    def test_xlsx_table_hours(self):
+        #check if the hours are read incorrectly
+        _skip_if_no_xlrd()
+        _skip_if_no_openpyxl()
+        _skip_if_no_mpl()
+        import datetime as dt
+
+
+
+        # 1900 datemode file
+        filename = 'example_file_2013-07-25.xlsx'
+        pth = os.path.join(self.dirpath, filename)
+        xlsx = ExcelFile(pth)
+        # parse_dates=False is necessary to obtain right sorting of rows in df
+        # TODO: this must actually be skiprows=11, header=10
+#        df =xlsx.parse('min', skiprows=12, header=10, index_col=1,
+#                       parse_dates=False, date_parser=correct_date_time)
+        df = xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                        parse_dates=False, date_parser=_correct_date_time)
+
+        df_start = df.index[0]
+        df_end = df.index[-1]  # scalar label, not a one-element slice
+        # test: are the first/last index equal to the cell read in directly by xlrd
+        excel_cells = read_excel_cell(pth)
+
+        xl_start = _offset_time(excel_cells[0])
+        xl_end = _offset_time(excel_cells[1])
+
+        self.assertEqual(df_start, xl_start)
+        self.assertEqual(df_end, xl_end)
+
+        #test Excel 1904 datemode
+        filename_1904 = 'example_file_2013-07-25_1904-dates.xlsx'
+        pth = os.path.join(self.dirpath, filename_1904)
+        xlsx = ExcelFile(pth)
+        # parse_dates=False is necessary to obtain right sorting of rows in df
+        # TODO: this must actually be skiprows=11
+        df = xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                        parse_dates=False, date_parser=_correct_date_time)
+
+        df_start = df.index[0]
+        df_end = df.index[-1]  # scalar label, not a one-element slice
+
+        excel_cells = read_excel_cell(pth)
+        xl_start = _offset_time(excel_cells[0])
+        xl_end = _offset_time(excel_cells[1])
+
+        # test: are the first/last index equal to the cell read in directly
+        self.assertEqual(df_start, xl_start)
+        self.assertEqual(df_end, xl_end)
+
+        # test if a produced datetime is equal to a datetime directly produced by xlrd
+        daydt_str = filename.split('.')[0][-10:]
+        daydt = dt.datetime.strptime(daydt_str, '%Y-%m-%d')
+#
+        df['date'] = daydt
+        df['time'] = df.index
+
+        #TODO review this
+#        df['datetime'] = df.apply(lambda x: pd.datetime.combine(x['date'], x['time'], axis=1))
+
+#        df.set_index(['datetime'])
+#        import datetime as dt
+#        dt_test = dt.datetime.combine(daydt, excel_cells[1])
+
+#        pdt_test = df.index[-1]
+
+#        self.assertEqual(dt_test, pdt_test)
+
     def test_specify_kind_xls(self):
         _skip_if_no_xlrd()
         xlsx_file = os.path.join(self.dirpath, 'test.xlsx')
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 787682f340250..f225200c53a87 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1,4 +1,6 @@
 # pylint: disable=E1101
 
+from __future__ import absolute_import
+
 from datetime import datetime
 import csv
@@ -38,6 +40,13 @@
 
 from pandas.parser import OverflowError
 
+def _skip_if_no_mpl():
+    '''pandas.tseries.converter imports matplotlib'''
+    try:
+        import matplotlib
+    except ImportError:
+        raise nose.SkipTest('matplotlib not installed, skipping')
+
 
 class ParserTests(object):
     """
@@ -2015,6 +2024,30 @@ def test_iteration_open_handle(self):
         expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
         tm.assert_series_equal(result, expected)
 
+    def test_infer_columns(self):
+        '''reads xlsx with a gap between the header row and the data rows'''
+        _skip_if_no_mpl()
+        from pandas.io.excel import ExcelFile
+        from . import test_excel
+        correct_date_time = test_excel._correct_date_time
+        test_excel._skip_if_no_excelsuite()
+
+        # test if the header row is read in correctly
+        # list with the expected column names from the excel file
+        headercols_target = ['blank', 'temperature', 'precipitation', 'Area']
+
+        # read the excel file into a DataFrame
+        filename = 'example_file_2013-07-25.xlsx'
+        pth = os.path.join(self.dirpath, filename)
+        xlsx = ExcelFile(pth)
+        df = xlsx.parse('min', skiprows=12, header=10, index_col=1,
+                        parse_dates=False, date_parser=correct_date_time)
+        # column labels as actually parsed
+        headercols_df_in = df.columns.tolist()
+
+        self.assertEqual(headercols_df_in, headercols_target)
+
+
 class TestCParserHighMemory(ParserTests, unittest.TestCase):
 
     def read_csv(self, *args, **kwds):