Skip to content

Commit 98e25fc

Browse files
committed
CLN: Add unicode compatibility wrapper for dateutil.
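Dateutil < 2.0 doesn't always handle unicode well. This wraps `dateutil.parser.parse` in a compatibility function (`pandas.compat.parse_date`) that converts unicode input to bytes before parsing. It also wraps the URL passed to `_urlopen` inside `get_filepath_or_buffer` in `str`.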
1 parent 27b601e commit 98e25fc

File tree

10 files changed

+35
-22
lines changed

10 files changed

+35
-22
lines changed

pandas/compat/__init__.py

+10
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
# pylint disable=W0611
2929
import functools
3030
import itertools
31+
from distutils.version import LooseVersion
3132
from itertools import product
3233
import sys
3334
import types
@@ -663,6 +664,15 @@ def __and__(self, other):
663664
# http://stackoverflow.com/questions/4126348
664665
# Thanks to @martineau at SO
665666

667+
from dateutil import parser as _date_parser
668+
import dateutil
669+
if LooseVersion(dateutil.__version__) < '2.0':
670+
@functools.wraps(_date_parser.parse)
671+
def parse_date(timestr, *args, **kwargs):
672+
timestr = bytes(timestr)
673+
return _date_parser.parse(timestr, *args, **kwargs)
674+
else:
675+
parse_date = _date_parser.parse
666676

667677
class OrderedDefaultdict(OrderedDict):
668678

pandas/core/datetools.py

-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from pandas.tseries.tools import *
44
from pandas.tseries.offsets import *
55
from pandas.tseries.frequencies import *
6-
from dateutil import parser
76

87
day = DateOffset()
98
bday = BDay()

pandas/io/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
8282
"""
8383

8484
if _is_url(filepath_or_buffer):
85-
req = _urlopen(filepath_or_buffer)
85+
req = _urlopen(str(filepath_or_buffer))
8686
if compat.PY3: # pragma: no cover
8787
if encoding:
8888
errors = 'strict'

pandas/io/tests/test_parsers.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import pandas.util.testing as tm
2828
import pandas as pd
2929

30+
from pandas.compat import parse_date
3031
import pandas.lib as lib
3132
from pandas import compat
3233
from pandas.lib import Timestamp
@@ -1254,13 +1255,13 @@ def test_converters(self):
12541255
b,3,4,01/02/2009
12551256
c,4,5,01/03/2009
12561257
"""
1257-
from dateutil import parser
1258+
from pandas.compat import parse_date
12581259

1259-
result = self.read_csv(StringIO(data), converters={'D': parser.parse})
1260-
result2 = self.read_csv(StringIO(data), converters={3: parser.parse})
1260+
result = self.read_csv(StringIO(data), converters={'D': parse_date})
1261+
result2 = self.read_csv(StringIO(data), converters={3: parse_date})
12611262

12621263
expected = self.read_csv(StringIO(data))
1263-
expected['D'] = expected['D'].map(parser.parse)
1264+
expected['D'] = expected['D'].map(parse_date)
12641265

12651266
tm.assert_isinstance(result['D'][0], (datetime, Timestamp))
12661267
tm.assert_frame_equal(result, expected)
@@ -1327,13 +1328,12 @@ def test_read_csv_parse_simple_list(self):
13271328
tm.assert_frame_equal(df, expected)
13281329

13291330
def test_parse_dates_custom_euroformat(self):
1330-
from dateutil.parser import parse
13311331
text = """foo,bar,baz
13321332
31/01/2010,1,2
13331333
01/02/2010,1,NA
13341334
02/02/2010,1,2
13351335
"""
1336-
parser = lambda d: parse(d, dayfirst=True)
1336+
parser = lambda d: parse_date(d, dayfirst=True)
13371337
df = self.read_csv(StringIO(text),
13381338
names=['time', 'Q', 'NTU'], header=0,
13391339
index_col=0, parse_dates=True,
@@ -1345,7 +1345,7 @@ def test_parse_dates_custom_euroformat(self):
13451345
index=exp_index, columns=['Q', 'NTU'])
13461346
tm.assert_frame_equal(df, expected)
13471347

1348-
parser = lambda d: parse(d, day_first=True)
1348+
parser = lambda d: parse_date(d, day_first=True)
13491349
self.assertRaises(Exception, self.read_csv,
13501350
StringIO(text), skiprows=[0],
13511351
names=['time', 'Q', 'NTU'], index_col=0,

pandas/tseries/tests/test_timeseries.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,7 @@ def test_fillna_nat(self):
732732
def test_string_na_nat_conversion(self):
733733
# GH #999, #858
734734

735-
from dateutil.parser import parse
735+
from pandas.compat import parse_date
736736

737737
strings = np.array(['1/1/2000', '1/2/2000', np.nan,
738738
'1/4/2000, 12:34:56'], dtype=object)
@@ -742,7 +742,7 @@ def test_string_na_nat_conversion(self):
742742
if com.isnull(val):
743743
expected[i] = iNaT
744744
else:
745-
expected[i] = parse(val)
745+
expected[i] = parse_date(val)
746746

747747
result = tslib.array_to_datetime(strings)
748748
assert_almost_equal(result, expected)

pandas/tseries/tools.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
238238
parsed, reso = dateutil_parse(arg, default, dayfirst=dayfirst,
239239
yearfirst=yearfirst)
240240
except Exception as e:
241+
# TODO: allow raise of errors within instead
241242
raise DateParseError(e)
242243

243244
if parsed is None:
@@ -252,19 +253,25 @@ def dateutil_parse(timestr, default,
252253
""" lifted from dateutil to get resolution"""
253254
from dateutil import tz
254255
import time
256+
fobj = StringIO(str(timestr))
255257

256-
res = DEFAULTPARSER._parse(StringIO(timestr), **kwargs)
258+
res = DEFAULTPARSER._parse(fobj, **kwargs)
257259

258260
if res is None:
259261
raise ValueError("unknown string format")
260262

261263
repl = {}
264+
reso = None
262265
for attr in ["year", "month", "day", "hour",
263266
"minute", "second", "microsecond"]:
264267
value = getattr(res, attr)
265268
if value is not None:
266269
repl[attr] = value
267270
reso = attr
271+
272+
if reso is None:
273+
raise ValueError("Cannot parse date.")
274+
268275
if reso == 'microsecond' and repl['microsecond'] == 0:
269276
reso = 'second'
270277

pandas/tslib.pyx

+3-5
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ cimport cython
2828

2929
from datetime import timedelta, datetime
3030
from datetime import time as datetime_time
31-
from dateutil.parser import parse as parse_date
31+
from pandas.compat import parse_date
3232

3333
cdef extern from "Python.h":
3434
int PySlice_Check(object)
@@ -852,8 +852,6 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
852852
_TSObject _ts
853853
int64_t m = cast_from_unit(unit,None)
854854

855-
from dateutil.parser import parse
856-
857855
try:
858856
result = np.empty(n, dtype='M8[ns]')
859857
iresult = result.view('i8')
@@ -917,7 +915,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
917915
elif raise_:
918916
raise
919917
try:
920-
result[i] = parse(val, dayfirst=dayfirst)
918+
result[i] = parse_date(val, dayfirst=dayfirst)
921919
except Exception:
922920
if coerce:
923921
iresult[i] = iNaT
@@ -946,7 +944,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
946944
oresult[i] = 'NaT'
947945
continue
948946
try:
949-
oresult[i] = parse(val, dayfirst=dayfirst)
947+
oresult[i] = parse_date(val, dayfirst=dayfirst)
950948
except Exception:
951949
if raise_:
952950
raise

scripts/find_commits_touching_func.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import re
1717
import os
1818
from collections import namedtuple
19-
from dateutil import parser
19+
from pandas.compat import parse_date
2020

2121
try:
2222
import sh
@@ -98,7 +98,7 @@ def get_commit_info(c,fmt,sep='\t'):
9898

9999
def get_commit_vitals(c,hlen=HASH_LEN):
100100
h,s,d= get_commit_info(c,'%H\t%s\t%ci',"\t")
101-
return h[:hlen],s,parser.parse(d)
101+
return h[:hlen],s,parse_date(d)
102102

103103
def file_filter(state,dirname,fnames):
104104
if args.dir_masks and not any([re.search(x,dirname) for x in args.dir_masks]):

scripts/git_code_churn.py

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from dateutil import parser
21
import subprocess
32
import os
43
import re

vb_suite/test_perf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ def main():
466466
def _parse_commit_log(this,repo_path,base_commit=None):
467467
from vbench.git import _convert_timezones
468468
from pandas import Series
469-
from dateutil import parser as dparser
469+
from pandas.compat import parse_date
470470

471471
git_cmd = 'git --git-dir=%s/.git --work-tree=%s ' % (repo_path, repo_path)
472472
githist = git_cmd + ('log --graph --pretty=format:'+
@@ -488,7 +488,7 @@ def _parse_commit_log(this,repo_path,base_commit=None):
488488
_, sha, stamp, message, author = line.split('::', 4)
489489

490490
# parse timestamp into datetime object
491-
stamp = dparser.parse(stamp)
491+
stamp = parse_date(stamp)
492492

493493
shas.append(sha)
494494
timestamps.append(stamp)

0 commit comments

Comments
 (0)