@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
11
11
12
12
import numpy as np
13
13
14
+ import six
15
+ from six import binary_type, text_type
16
+
14
17
# Avoid import from outside _libs
15
18
if sys.version_info.major == 2 :
16
19
from StringIO import StringIO
@@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years,
531
534
# ----------------------------------------------------------------------
532
535
# Miscellaneous
533
536
534
- _DATEUTIL_LEXER_SPLIT = None
535
- try :
536
- # Since these are private methods from dateutil, it is safely imported
537
- # here so in case this interface changes, pandas will just fallback
538
- # to not using the functionality
539
- from dateutil.parser import _timelex
540
-
541
- if hasattr (_timelex, ' split' ):
542
- def _lexer_split_from_str (dt_str ):
543
- # The StringIO(str(_)) is for dateutil 2.2 compatibility
544
- return _timelex.split(StringIO(str (dt_str)))
545
537
546
- _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547
- except (ImportError , AttributeError ):
548
- pass
538
# Class ported (with modifications) from
# https://github.com/dateutil/dateutil/pull/732
#
# We use this class to parse and tokenize date strings. However, as it is
# a private class in the dateutil library, relying on backwards compatibility
# is not practical. In fact, using this class issues warnings (xref gh-21322).
# Thus, we port the class over so that both issues are resolved.
#
# Copyright (c) 2017 - dateutil contributors

# The text type differs between Python 2 (unicode) and Python 3 (str);
# follow the same version check this module already uses above instead of
# pulling in the third-party `six` shim.
if sys.version_info.major == 2:
    _text_type = unicode  # noqa: F821  (only defined on Python 2)
else:
    _text_type = str

# Tokenization pattern, compiled once at import time. Raw strings are
# required so the regex escapes (\s, \d, \.) are not mangled by
# string-literal escaping (invalid escapes warn on modern Python).
# TODO: Change \s --> \s+ (this doesn't match existing behavior)
# TODO: change the punctuation block to punc+ (doesn't match existing)
# TODO: can we merge the two digit patterns?
_TIMELEX_PATTERN = re.compile(r"\s|"
                              r"(?<![\.\d])\d+\.\d+(?![\.\d])"
                              r"|\d+"
                              r"|[a-zA-Z]+"
                              r"|[\./:]+"
                              r"|[^\da-zA-Z\./:\s]+")


class _timelex(object):
    """Tokenizer that splits a date/time string into lexical units."""

    def __init__(self, instream):
        # Normalize `instream` to text: accept str/bytes/bytearray or a
        # file-like object exposing read().
        if sys.version_info.major == 2:
            # In Python 2, we can't duck type properly because unicode has
            # a 'decode' function, and we'd be double-decoding
            if isinstance(instream, (bytes, bytearray)):
                instream = instream.decode()
        else:
            if getattr(instream, 'decode', None) is not None:
                instream = instream.decode()

        if isinstance(instream, _text_type):
            self.stream = instream
        elif getattr(instream, 'read', None) is None:
            raise TypeError(
                'Parser must be a string or character stream, not '
                '{itype}'.format(itype=instream.__class__.__name__))
        else:
            self.stream = instream.read()

    def get_tokens(self):
        """
        This function breaks the time string into lexical units (tokens),
        which can be parsed by the parser. Lexical units are demarcated by
        changes in the character set, so any continuous string of letters is
        considered one unit, any continuous string of numbers is considered
        one unit.

        The main complication arises from the fact that dots ('.') can be
        used both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
        "4:30:21.447"). As such, it is necessary to read the full context of
        any dot-separated strings before breaking it into tokens; as such,
        this function maintains a "token stack", for when the ambiguous
        context demands that multiple tokens be parsed at once.
        """
        # NUL bytes would confuse downstream parsing; strip them up front.
        stream = self.stream.replace('\x00', '')

        tokens = _TIMELEX_PATTERN.findall(stream)

        # Re-combine token tuples of the form ["59", ",", "456"] because
        # in this context the "," is treated as a decimal
        # (e.g. in python's default logging format)
        for n, token in enumerate(tokens[:-2]):
            # Kludge to match ,-decimal behavior; it'd be better to do this
            # later in the process and have a simpler tokenization
            if (token is not None and token.isdigit() and
                    tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
                # Have to check None b/c it might be replaced during the loop
                tokens[n] = token + '.' + tokens[n + 2]
                tokens[n + 1] = None
                tokens[n + 2] = None

        tokens = [x for x in tokens if x is not None]
        return tokens

    @classmethod
    def split(cls, s):
        """Tokenize `s` and return the resulting list of tokens."""
        return cls(s).get_tokens()


_DATEUTIL_LEXER_SPLIT = _timelex.split
549
614
550
615
551
616
def _format_is_iso (f ) -> bint:
0 commit comments