Skip to content

Commit 7b88b7b

Browse files
committed
MAINT: Port _timelex in codebase
Removes the DeprecationWarning raised by dateutil because it's a private class. Implementation taken from the following PR: dateutil/dateutil#732 Closes gh-21322.
1 parent bf9d41c commit 7b88b7b

File tree

4 files changed

+133
-19
lines changed

4 files changed

+133
-19
lines changed

LICENSES/DATEUTIL_LICENSE

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
Copyright 2017- Paul Ganssle <[email protected]>
2+
Copyright 2017- dateutil contributors (see AUTHORS file)
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
16+
The above license applies to all contributions after 2017-12-01, as well as
17+
all contributions that have been re-licensed (see AUTHORS file for the list of
18+
contributors who have re-licensed their code).
19+
--------------------------------------------------------------------------------
20+
dateutil - Extensions to the standard Python datetime module.
21+
22+
Copyright (c) 2003-2011 - Gustavo Niemeyer <[email protected]>
23+
Copyright (c) 2012-2014 - Tomi Pieviläinen <[email protected]>
24+
Copyright (c) 2014-2016 - Yaron de Leeuw <[email protected]>
25+
Copyright (c) 2015- - Paul Ganssle <[email protected]>
26+
Copyright (c) 2015- - dateutil contributors (see AUTHORS file)
27+
28+
All rights reserved.
29+
30+
Redistribution and use in source and binary forms, with or without
31+
modification, are permitted provided that the following conditions are met:
32+
33+
* Redistributions of source code must retain the above copyright notice,
34+
this list of conditions and the following disclaimer.
35+
* Redistributions in binary form must reproduce the above copyright notice,
36+
this list of conditions and the following disclaimer in the documentation
37+
and/or other materials provided with the distribution.
38+
* Neither the name of the copyright holder nor the names of its
39+
contributors may be used to endorse or promote products derived from
40+
this software without specific prior written permission.
41+
42+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
46+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
47+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
48+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
49+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
50+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
51+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53+
54+
The above BSD License Applies to all code, even that also covered by Apache 2.0.

pandas/_libs/tslibs/parsing.pyx

+79-14
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
1111

1212
import numpy as np
1313

14+
import six
15+
from six import binary_type, text_type
16+
1417
# Avoid import from outside _libs
1518
if sys.version_info.major == 2:
1619
from StringIO import StringIO
@@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years,
531534
# ----------------------------------------------------------------------
532535
# Miscellaneous
533536

534-
_DATEUTIL_LEXER_SPLIT = None
535-
try:
536-
# Since these are private methods from dateutil, it is safely imported
537-
# here so in case this interface changes, pandas will just fallback
538-
# to not using the functionality
539-
from dateutil.parser import _timelex
540-
541-
if hasattr(_timelex, 'split'):
542-
def _lexer_split_from_str(dt_str):
543-
# The StringIO(str(_)) is for dateutil 2.2 compatibility
544-
return _timelex.split(StringIO(str(dt_str)))
545537

546-
_DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547-
except (ImportError, AttributeError):
548-
pass
538+
# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
539+
#
540+
# Copyright (c) 2017 - Paul Ganssle <[email protected]>
541+
# Copyright (c) 2017 - dateutil contributors
542+
class _timelex(object):
543+
def __init__(self, instream):
544+
if six.PY2:
545+
# In Python 2, we can't duck type properly because unicode has
546+
# a 'decode' function, and we'd be double-decoding
547+
if isinstance(instream, (binary_type, bytearray)):
548+
instream = instream.decode()
549+
else:
550+
if getattr(instream, 'decode', None) is not None:
551+
instream = instream.decode()
552+
553+
if isinstance(instream, text_type):
554+
self.stream = instream
555+
elif getattr(instream, 'read', None) is None:
556+
raise TypeError(
557+
'Parser must be a string or character stream, not '
558+
'{itype}'.format(itype=instream.__class__.__name__))
559+
else:
560+
self.stream = instream.read()
561+
562+
def get_tokens(self):
563+
"""
564+
This function breaks the time string into lexical units (tokens), which
565+
can be parsed by the parser. Lexical units are demarcated by changes in
566+
the character set, so any continuous string of letters is considered
567+
one unit, any continuous string of numbers is considered one unit.
568+
The main complication arises from the fact that dots ('.') can be used
569+
both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
570+
"4:30:21.447"). As such, it is necessary to read the full context of
571+
any dot-separated strings before breaking it into tokens; as such, this
572+
function maintains a "token stack", for when the ambiguous context
573+
demands that multiple tokens be parsed at once.
574+
"""
575+
stream = self.stream.replace('\x00', '')
576+
577+
# TODO: Change \s --> \s+ (this doesn't match existing behavior)
578+
# TODO: change the punctuation block to punc+ (doesnt match existing)
579+
# TODO: can we merge the two digit patterns?
580+
tokens = re.findall('\s|'
581+
'(?<![\.\d])\d+\.\d+(?![\.\d])'
582+
'|\d+'
583+
'|[a-zA-Z]+'
584+
'|[\./:]+'
585+
'|[^\da-zA-Z\./:\s]+', stream)
586+
587+
# Re-combine token tuples of the form ["59", ",", "456"] because
588+
# in this context the "," is treated as a decimal
589+
# (e.g. in python's default logging format)
590+
for n, token in enumerate(tokens[:-2]):
591+
# Kludge to match ,-decimal behavior; it'd be better to do this
592+
# later in the process and have a simpler tokenization
593+
if (token is not None and token.isdigit() and
594+
tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
595+
# Have to check None b/c it might be replaced during the loop
596+
# TODO: I _really_ don't faking the value here
597+
tokens[n] = token + '.' + tokens[n + 2]
598+
tokens[n + 1] = None
599+
tokens[n + 2] = None
600+
601+
tokens = [x for x in tokens if x is not None]
602+
return tokens
603+
604+
@classmethod
605+
def split(cls, s):
606+
return cls(s).get_tokens()
607+
608+
609+
def _lexer_split_from_str(dt_str):
    """
    Tokenize ``dt_str`` with ``_timelex``.

    The value is coerced with ``str()`` and wrapped in a ``StringIO``
    before being handed to the lexer; this wrapping was originally
    introduced for dateutil 2.2 compatibility.
    """
    text_stream = StringIO(str(dt_str))
    return _timelex.split(text_stream)


# Module-level alias for the lexer entry point.
_DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
549614

550615

551616
def _format_is_iso(f) -> bint:

pandas/tests/indexes/datetimes/test_tools.py

-2
Original file line numberDiff line numberDiff line change
@@ -1243,8 +1243,6 @@ def test_dayfirst(self, cache):
12431243
class TestGuessDatetimeFormat(object):
12441244

12451245
@td.skip_if_not_us_locale
1246-
@pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning")
1247-
# https://github.com/pandas-dev/pandas/issues/21322
12481246
def test_guess_datetime_format_for_array(self):
12491247
expected_format = '%Y-%m-%d %H:%M:%S.%f'
12501248
dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)

pandas/tests/tslibs/test_parsing.py

-3
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ def test_parsers_monthfreq(self):
9494
assert result1 == expected
9595

9696

97-
@pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning")
9897
class TestGuessDatetimeFormat(object):
9998

10099
@td.skip_if_not_us_locale
@@ -163,8 +162,6 @@ def test_guess_datetime_format_invalid_inputs(self):
163162
('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'),
164163
('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'),
165164
('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S')])
166-
# https://github.com/pandas-dev/pandas/issues/21322 for _timelex
167-
@pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning")
168165
def test_guess_datetime_format_nopadding(self, string, format):
169166
# GH 11142
170167
result = parsing._guess_datetime_format(string)

0 commit comments

Comments
 (0)