@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
11
11
12
12
import numpy as np
13
13
14
+ import six
15
+ from six import binary_type, text_type
16
+
14
17
# Avoid import from outside _libs
15
18
if sys.version_info.major == 2 :
16
19
from StringIO import StringIO
@@ -531,21 +534,84 @@ def try_parse_datetime_components(object[:] years,
531
534
# ----------------------------------------------------------------------
532
535
# Miscellaneous
533
536
534
- _DATEUTIL_LEXER_SPLIT = None
535
- try :
536
- # Since these are private methods from dateutil, it is safely imported
537
- # here so in case this interface changes, pandas will just fallback
538
- # to not using the functionality
539
- from dateutil.parser import _timelex
540
-
541
- if hasattr (_timelex, ' split' ):
542
- def _lexer_split_from_str (dt_str ):
543
- # The StringIO(str(_)) is for dateutil 2.2 compatibility
544
- return _timelex.split(StringIO(str (dt_str)))
545
537
546
- _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547
- except (ImportError , AttributeError ):
548
- pass
538
+ # Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
539
+ #
540
+ # Copyright (c) 2017 - Paul Ganssle <[email protected] >
541
+ # Copyright (c) 2017 - dateutil contributors
542
class _timelex(object):
    """
    Regex-based lexer that breaks a date/time string into tokens.

    Adapted from https://github.com/dateutil/dateutil/pull/732. It differs
    from that code in two ways: the regular expression is written with raw
    strings, and the comma-as-decimal merge reads the live token list so a
    token consumed by an earlier merge is never reused.

    Parameters
    ----------
    instream : text, bytes, bytearray or object with a ``read`` method
        The input to tokenize. Binary input is decoded with the default
        codec; stream input is read in full at construction time.

    Raises
    ------
    TypeError
        If ``instream`` is neither text nor an object exposing ``read``.
    """

    def __init__(self, instream):
        if six.PY2:
            # In Python 2, we can't duck type properly because unicode has
            # a 'decode' function, and we'd be double-decoding
            if isinstance(instream, (binary_type, bytearray)):
                instream = instream.decode()
        else:
            if getattr(instream, 'decode', None) is not None:
                instream = instream.decode()

        if isinstance(instream, text_type):
            self.stream = instream
        elif getattr(instream, 'read', None) is None:
            raise TypeError(
                'Parser must be a string or character stream, not '
                '{itype}'.format(itype=instream.__class__.__name__))
        else:
            self.stream = instream.read()

    def get_tokens(self):
        """
        This function breaks the time string into lexical units (tokens), which
        can be parsed by the parser. Lexical units are demarcated by changes in
        the character set, so any continuous string of letters is considered
        one unit, any continuous string of numbers is considered one unit.
        The main complication arises from the fact that dots ('.') can be used
        both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
        "4:30:21.447"). A run of digits, a dot and more digits is kept as a
        single token only when it is not adjacent to another dot or digit.

        A ``digits , digits`` triple is additionally merged into a single
        ``digits.digits`` token, because in that context the comma acts as a
        decimal separator (e.g. in python's default logging format).

        Returns
        -------
        list
            The tokens in input order. NUL characters are removed from the
            input before tokenizing; each whitespace character is its own
            token.
        """
        stream = self.stream.replace('\x00', '')

        # Raw strings keep the regex escapes (\s, \d, \.) out of Python's own
        # string-escape processing.
        # TODO: Change \s --> \s+ (this doesn't match existing behavior)
        # TODO: change the punctuation block to punc+ (doesn't match existing)
        # TODO: can we merge the two digit patterns?
        tokens = re.findall(r'\s|'
                            r'(?<![\.\d])\d+\.\d+(?![\.\d])'
                            r'|\d+'
                            r'|[a-zA-Z]+'
                            r'|[\./:]+'
                            r'|[^\da-zA-Z\./:\s]+', stream)

        # Re-combine token tuples of the form ["59", ",", "456"] because
        # in this context the "," is treated as a decimal
        # (e.g. in python's default logging format)
        for n in range(len(tokens) - 2):
            # Read from the live list, not a slice copy: an earlier merge may
            # have replaced this slot with None, and such a consumed token
            # must not start (or take part in) another merge.
            token = tokens[n]
            # Kludge to match ,-decimal behavior; it'd be better to do this
            # later in the process and have a simpler tokenization
            if (token is not None and token.isdigit() and
                    tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
                # tokens[n + 2] cannot be None here: merges only blank the
                # two slots directly after an already-visited index.
                # TODO: I _really_ don't like faking the value here
                tokens[n] = token + '.' + tokens[n + 2]
                tokens[n + 1] = None
                tokens[n + 2] = None

        return [x for x in tokens if x is not None]

    @classmethod
    def split(cls, s):
        """Tokenize ``s`` in one call; shorthand for ``cls(s).get_tokens()``."""
        return cls(s).get_tokens()
607
+
608
+
609
def _lexer_split_from_str(dt_str):
    """
    Tokenize ``dt_str`` with ``_timelex``.

    The argument is converted with ``str`` and wrapped in a ``StringIO``
    before being handed to the lexer; the wrapping was originally introduced
    for dateutil 2.2 compatibility.
    """
    buffered = StringIO(str(dt_str))
    return _timelex.split(buffered)


_DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
549
615
550
616
551
617
def _format_is_iso (f ) -> bint:
0 commit comments