
now sectionwise: excel / date_parser #4332 #4634

Closed · wants to merge 4 commits
75 changes: 75 additions & 0 deletions pandas/io/date_converters.py
@@ -1,4 +1,6 @@
"""This module is designed for community supported date conversion functions"""
from datetime import datetime, timedelta, time

from pandas.compat import range
import numpy as np
import pandas.lib as lib
@@ -56,3 +58,76 @@ def _check_columns(cols):
raise AssertionError()

return N


## Datetime Conversion for date_parsers
## see also: create a community supported set of typical converters
## https://github.com/pydata/pandas/issues/1180

def offset_datetime(dt_in, days=0, hours=0, minutes=0,
                    seconds=0, microseconds=0):
    '''apply a corrective time offset using datetime.timedelta

    input
    -----
    dt_in : datetime.time or datetime.datetime object
    days : integer value (positive or negative) for the days component of the offset
    hours : integer value (positive or negative) for the hours component of the offset
    minutes : integer value (positive or negative) for the minutes component
        of the offset
    seconds : integer value (positive or negative) for the seconds component
        of the offset
    microseconds : integer value (positive or negative) for the microseconds
        component of the offset

    output
    ------
    dt_corr : datetime.time or datetime.datetime object
    '''
    # an Excel timestamp like '23.07.2013 24:00' actually means
    # '23.07.2013 23:59' in Python terms, so it must be corrected,
    # e.g. with offset = -10  # minutes
    delta = timedelta(days=days, hours=hours, minutes=minutes,
                      seconds=seconds, microseconds=microseconds)

    # check whether the offset is applied to a datetime or a time
    if type(dt_in) is time:
        # create a pseudo datetime by combining with the current date
        dt_now = datetime.now()
        dt_base = datetime.combine(dt_now, dt_in)
    else:
        dt_base = dt_in

    dt_corr = dt_base + delta

    # if the input was a time, return a time again
    if type(dt_in) is time:
        dt_corr = dt_corr.time()

    return dt_corr


def dt2ti(dt_in):
    '''convert a mis-parsed datetime.datetime to a datetime.time

    input
    -----
    dt_in : datetime.time or datetime.datetime object

    output
    ------
    ti_corr : datetime.time object
    '''
    # correct those values which are not already of type datetime.time
    # important hint:
    # http://stackoverflow.com/a/12906456
    if isinstance(dt_in, datetime):
        dt_in = dt_in.time()

    return dt_in
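
A minimal usage sketch (not part of the diff) of how the two helpers above could be wired into the date_parser hook that the excel.py changes below forward to every parsed date cell; the file path and sheet name are placeholders.

# usage sketch, assuming this PR's date_parser pass-through in ExcelFile.parse
from functools import partial

import pandas as pd
import pandas.io.date_converters as conv

# shift every timestamp back by 10 minutes so that sheets using '24:00'
# end-of-day stamps fit into Python's 0-23 hour range
fix_midnight = partial(conv.offset_datetime, minutes=-10)

xls = pd.ExcelFile('example_file_2013-07-25.xlsx')            # placeholder path
df = xls.parse('Sheet1', header=0, date_parser=fix_midnight)  # sheet name assumed

# reduce full datetimes to plain times where only the time of day matters
df['time_only'] = df.iloc[:, 0].map(conv.dt2ti)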
31 changes: 27 additions & 4 deletions pandas/io/excel.py
@@ -127,15 +127,18 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
skipfooter = kwds.pop('skipfooter', None)
if skipfooter is not None:
skip_footer = skipfooter

return self._parse_excel(sheetname, header=header, skiprows=skiprows,

# this now returns a DataFrame
res = self._parse_excel(sheetname, header=header, skiprows=skiprows,
index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
parse_dates=parse_dates,
date_parser=date_parser, na_values=na_values,
thousands=thousands, chunksize=chunksize,
skip_footer=skip_footer, **kwds)

return res

def _should_parse(self, i, parse_cols):

@@ -195,11 +198,24 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
if parse_cols is None or should_parse[j]:
if typ == XL_CELL_DATE:
dt = xldate_as_tuple(value, datemode)

# how do we produce this first case?
# if the year is zero, the value is a time of day (hours/minutes)
if dt[0] < datetime.MINYEAR: # pragma: no cover
value = datetime.time(*dt[3:])
datemode = 1
dt = xldate_as_tuple(value, datemode)

value = datetime.time(*dt[3:])


# otherwise insert a full date
else:
value = datetime.datetime(*dt)

# apply the date_parser correction, if one was given
if date_parser:
value = date_parser(value)

elif typ == XL_CELL_ERROR:
value = np.nan
elif typ == XL_CELL_BOOLEAN:
@@ -221,8 +237,15 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
skip_footer=skip_footer,
chunksize=chunksize,
**kwds)
res = parser.read()

if header is not None:

if len(data[header]) == len(res.columns.tolist()):
res.columns = data[header]


return parser.read()
return res

@property
def sheet_names(self):
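To make the zero-year branch in the _parse_excel hunk concrete: xlrd returns a (year, month, day, hour, minute, second) tuple, and for a cell that holds only a time of day the year comes back as 0, which is below datetime.MINYEAR and cannot be fed to datetime.datetime(*dt). A short illustration of that behaviour (plain xlrd semantics, not code from this PR):

# illustration of xlrd's behaviour for a time-only cell (assumption: standard
# xlrd semantics; this snippet is not part of the PR)
import datetime
from xlrd import xldate_as_tuple

datemode = 0                 # 1900-based workbook
cell_value = 0.75            # Excel stores 18:00 as the day fraction 0.75

dt = xldate_as_tuple(cell_value, datemode)
print(dt)                    # (0, 0, 0, 18, 0, 0)

if dt[0] < datetime.MINYEAR:
    value = datetime.time(*dt[3:])   # datetime.time(18, 0)
else:
    value = datetime.datetime(*dt)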
32 changes: 30 additions & 2 deletions pandas/io/parsers.py
@@ -1150,7 +1150,11 @@ def TextParser(*args, **kwds):
returns Series if only one column
"""
kwds['engine'] = 'python'
return TextFileReader(*args, **kwds)

res = TextFileReader(*args, **kwds)


return res
Contributor review comment:
this is unnecessary, please revert this part.


# delimiter=None, dialect=None, names=None, header=0,
# index_col=None,
@@ -1385,6 +1389,7 @@ def _convert_data(self, data):
clean_conv)

def _infer_columns(self):
# TODO: this whole part is too complex and somewhat strange
names = self.names

if self.header is not None:
@@ -1396,13 +1401,20 @@
header = list(header) + [header[-1]+1]
else:
have_mi_columns = False
# TODO: explain why header (in this case a single number) needs to be a list
header = [ header ]

columns = []
for level, hr in enumerate(header):

#TODO: explain why self.buf is needed.
# the header is correctly retrieved in excel.py by
# data[header] = _trim_excel_header(data[header])
if len(self.buf) > 0:
line = self.buf[0]

elif (header[0] == hr) and (level == 0) and (header[0] > 0):
line = self._get_header()

else:
line = self._next_line()

@@ -1456,8 +1468,24 @@
columns = [ names ]

return columns

def _get_header(self):
''' read the header line (e.g. when header > 0)
FIXME: this should be turned into something much less complicated;
FIXME: the complexity is due to the header logic assuming that there is
never a row between data and header
'''
if isinstance(self.data, list):
line = self.data[self.header]
self.pos = self.header +1
else:
line = self._next_line()

return line

def _next_line(self):
# FIXME: why is self.data sometimes a list and sometimes a _csv.reader?
# reduce complexity here
if isinstance(self.data, list):
while self.pos in self.skiprows:
self.pos += 1
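For reference, the list-backed code path that _infer_columns and _get_header have to handle comes from excel.py, which hands TextParser a plain list of rows, so self.data is a list rather than a _csv.reader. A minimal sketch of that path, not taken from the diff:

# minimal sketch of the list-backed TextParser path (illustrative only)
from pandas.io.parsers import TextParser

rows = [['year', 'value'],        # header row at index 0
        [2013, 1.5],
        [2013, 2.5]]

parser = TextParser(rows, header=0)
df = parser.read()                # DataFrame with columns ['year', 'value']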
155 changes: 155 additions & 0 deletions pandas/io/test_date_converters.py
@@ -0,0 +1,155 @@
from pandas.compat import StringIO, BytesIO
from datetime import datetime, time, timedelta, date
import csv
import os
import sys
import re
import unittest

import nose

from numpy import nan
import numpy as np
from numpy.testing.decorators import slow

from pandas import DataFrame, Series, Index, isnull
import pandas.io.parsers as parsers
from pandas.io.parsers import (read_csv, read_table, read_fwf,
TextParser)
from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
assert_series_equal, network)
import pandas.lib as lib
from pandas import compat
from pandas.lib import Timestamp
import pandas.io.date_converters as conv


class TestConverters(unittest.TestCase):

def setUp(self):
self.years = np.array([2007, 2008])
self.months = np.array([1, 2])
self.days = np.array([3, 4])
self.hours = np.array([5, 6])
self.minutes = np.array([7, 8])
self.seconds = np.array([9, 0])
self.dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
self.times = np.array(['05:07:09', '06:08:00'], dtype=object)
self.expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
datetime(2008, 2, 4, 6, 8, 0)])

def test_parse_date_time(self):
result = conv.parse_date_time(self.dates, self.times)
self.assert_((result == self.expected).all())

data = """\
date, time, a, b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
"""
datecols = {'date_time': [0, 1]}
df = read_table(StringIO(data), sep=',', header=0,
parse_dates=datecols, date_parser=conv.parse_date_time)
self.assert_('date_time' in df)
self.assert_(df.date_time.ix[0] == datetime(2001, 1, 5, 10, 0, 0))

data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
"KORD,19990127, 23:00:00, 22:56:00, -0.5900")

date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
df = read_csv(StringIO(data), header=None, parse_dates=date_spec,
date_parser=conv.parse_date_time)

def test_parse_date_fields(self):
result = conv.parse_date_fields(self.years, self.months, self.days)
expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
self.assert_((result == expected).all())

data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11."
datecols = {'ymd': [0, 1, 2]}
df = read_table(StringIO(data), sep=',', header=0,
parse_dates=datecols,
date_parser=conv.parse_date_fields)
self.assert_('ymd' in df)
self.assert_(df.ymd.ix[0] == datetime(2001, 1, 10))

def test_datetime_six_col(self):
result = conv.parse_all_fields(self.years, self.months, self.days,
self.hours, self.minutes, self.seconds)
self.assert_((result == self.expected).all())

data = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0, 0.0, 10.
2001, 01, 5, 10, 0, 00, 1., 11.
"""
datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
df = read_table(StringIO(data), sep=',', header=0,
parse_dates=datecols,
date_parser=conv.parse_all_fields)
self.assert_('ymdHMS' in df)
self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0))

def test_datetime_fractional_seconds(self):
data = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
2001, 01, 5, 10, 0, 0.500000, 1., 11.
"""
datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
df = read_table(StringIO(data), sep=',', header=0,
parse_dates=datecols,
date_parser=conv.parse_all_fields)
self.assert_('ymdHMS' in df)
self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0,
microsecond=123456))
self.assert_(df.ymdHMS.ix[1] == datetime(2001, 1, 5, 10, 0, 0,
microsecond=500000))

def test_generic(self):
data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
datecols = {'ym': [0, 1]}
dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
df = read_table(StringIO(data), sep=',', header=0,
parse_dates=datecols,
date_parser=dateconverter)
self.assert_('ym' in df)
self.assert_(df.ym.ix[0] == date(2001, 1, 1))

def test_offset_datetime(self):
#test with a datetime.datetime object
dt_in = datetime(2013, 1, 1, 1, 10, 10, 100000)
dt_target = datetime(2013, 1, 2, 6, 20, 40, 100600)
dt_res = conv.offset_datetime(dt_in, days=1, hours=5, minutes=10,
seconds=30, microseconds=600)

assert(dt_res == dt_target)
#test with a datetime.time object
ti_in = time(1, 10, 20, 100000)
ti_target = time(6, 20, 50, 100600)
ti_res = conv.offset_datetime(ti_in, hours=5, minutes=10,
seconds=30, microseconds=600)
assert(ti_res == ti_target)

def test_dt2ti(self):
#a datetime.datetime object
dt_in = datetime(2013, 1, 1, 1, 10, 10, 100000)
ti_target = time(1, 10, 10, 100000)
dt2ti_dt_res = conv.dt2ti(dt_in)
assert(ti_target == dt2ti_dt_res)

#a datetime.time object
ti_in = time(1, 10, 20, 100000)
ti_target_dt2ti = time(1, 10, 20, 100000)
dt2ti_ti_res = conv.dt2ti(ti_in)
assert(ti_target_dt2ti == dt2ti_ti_res)


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
Binary file added pandas/io/tests/data/example_file_2013-07-25.xlsx
Binary file not shown.