diff --git a/LICENSES/DATEUTIL_LICENSE b/LICENSES/DATEUTIL_LICENSE new file mode 100644 index 0000000000000..6053d35cfc60b --- /dev/null +++ b/LICENSES/DATEUTIL_LICENSE @@ -0,0 +1,54 @@ +Copyright 2017- Paul Ganssle +Copyright 2017- dateutil contributors (see AUTHORS file) + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +The above license applies to all contributions after 2017-12-01, as well as +all contributions that have been re-licensed (see AUTHORS file for the list of +contributors who have re-licensed their code). +-------------------------------------------------------------------------------- +dateutil - Extensions to the standard Python datetime module. + +Copyright (c) 2003-2011 - Gustavo Niemeyer +Copyright (c) 2012-2014 - Tomi Pieviläinen +Copyright (c) 2014-2016 - Yaron de Leeuw +Copyright (c) 2015- - Paul Ganssle +Copyright (c) 2015- - dateutil contributors (see AUTHORS file) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The above BSD License Applies to all code, even that also covered by Apache 2.0. diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3a03018141f5a..82719de2dbdbd 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -11,6 +11,9 @@ from cpython.datetime cimport datetime import numpy as np +import six +from six import binary_type, text_type + # Avoid import from outside _libs if sys.version_info.major == 2: from StringIO import StringIO @@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years, # ---------------------------------------------------------------------- # Miscellaneous -_DATEUTIL_LEXER_SPLIT = None -try: - # Since these are private methods from dateutil, it is safely imported - # here so in case this interface changes, pandas will just fallback - # to not using the functionality - from dateutil.parser import _timelex - - if hasattr(_timelex, 'split'): - def _lexer_split_from_str(dt_str): - # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(StringIO(str(dt_str))) - _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str -except (ImportError, AttributeError): - pass +# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732 +# +# We use this class to parse and tokenize date strings. However, as it is +# a private class in the dateutil library, relying on backwards compatibility +# is not practical. In fact, using this class issues warnings (xref gh-21322). +# Thus, we port the class over so that both issues are resolved. +# +# Copyright (c) 2017 - dateutil contributors +class _timelex(object): + def __init__(self, instream): + if six.PY2: + # In Python 2, we can't duck type properly because unicode has + # a 'decode' function, and we'd be double-decoding + if isinstance(instream, (binary_type, bytearray)): + instream = instream.decode() + else: + if getattr(instream, 'decode', None) is not None: + instream = instream.decode() + + if isinstance(instream, text_type): + self.stream = instream + elif getattr(instream, 'read', None) is None: + raise TypeError( + 'Parser must be a string or character stream, not ' + '{itype}'.format(itype=instream.__class__.__name__)) + else: + self.stream = instream.read() + + def get_tokens(self): + """ + This function breaks the time string into lexical units (tokens), which + can be parsed by the parser. Lexical units are demarcated by changes in + the character set, so any continuous string of letters is considered + one unit, any continuous string of numbers is considered one unit. + The main complication arises from the fact that dots ('.') can be used + both as separators (e.g. "Sep.20.2009") or decimal points (e.g. + "4:30:21.447"). As such, it is necessary to read the full context of + any dot-separated strings before breaking it into tokens; as such, this + function maintains a "token stack", for when the ambiguous context + demands that multiple tokens be parsed at once. + """ + stream = self.stream.replace('\x00', '') + + # TODO: Change \s --> \s+ (this doesn't match existing behavior) + # TODO: change the punctuation block to punc+ (doesnt match existing) + # TODO: can we merge the two digit patterns? + tokens = re.findall('\s|' + '(? bint: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 530da1a625af4..deb1850a8b483 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1243,8 +1243,6 @@ def test_dayfirst(self, cache): class TestGuessDatetimeFormat(object): @td.skip_if_not_us_locale - @pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning") - # https://github.com/pandas-dev/pandas/issues/21322 def test_guess_datetime_format_for_array(self): expected_format = '%Y-%m-%d %H:%M:%S.%f' dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index f2b0ae98aff98..45a841cd1136d 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -94,7 +94,6 @@ def test_parsers_monthfreq(self): assert result1 == expected -@pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning") class TestGuessDatetimeFormat(object): @td.skip_if_not_us_locale @@ -163,8 +162,6 @@ def test_guess_datetime_format_invalid_inputs(self): ('2011-1-1 00:00:00', '%Y-%m-%d %H:%M:%S'), ('2011-1-1 0:0:0', '%Y-%m-%d %H:%M:%S'), ('2011-1-3T00:00:0', '%Y-%m-%dT%H:%M:%S')]) - # https://github.com/pandas-dev/pandas/issues/21322 for _timelex - @pytest.mark.filterwarnings("ignore:_timelex:DeprecationWarning") def test_guess_datetime_format_nopadding(self, string, format): # GH 11142 result = parsing._guess_datetime_format(string)