@@ -11,6 +11,9 @@ from cpython.datetime cimport datetime
11
11
12
12
import numpy as np
13
13
14
+ import six
15
+ from six import binary_type, text_type
16
+
14
17
# Avoid import from outside _libs
15
18
if sys.version_info.major == 2 :
16
19
from StringIO import StringIO
@@ -531,21 +534,83 @@ def try_parse_datetime_components(object[:] years,
531
534
# ----------------------------------------------------------------------
532
535
# Miscellaneous
533
536
534
- _DATEUTIL_LEXER_SPLIT = None
535
- try :
536
- # Since these are private methods from dateutil, it is safely imported
537
- # here so in case this interface changes, pandas will just fallback
538
- # to not using the functionality
539
- from dateutil.parser import _timelex
540
-
541
- if hasattr (_timelex, ' split' ):
542
- def _lexer_split_from_str (dt_str ):
543
- # The StringIO(str(_)) is for dateutil 2.2 compatibility
544
- return _timelex.split(StringIO(str (dt_str)))
545
537
546
- _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
547
- except (ImportError , AttributeError ):
548
- pass
538
# Class adapted from https://github.com/dateutil/dateutil/pull/732
#
# Copyright (c) 2017 - dateutil contributors
class _timelex(object):
    """
    Regex-based lexer that splits a date/time string into tokens.

    Parameters
    ----------
    instream : str, bytes, bytearray or object with a ``read`` method
        The text to tokenize. Byte input is decoded with ``decode()``;
        stream input is consumed in full with ``read()``.

    Raises
    ------
    TypeError
        If ``instream`` is neither text, decodable, nor readable.
    """

    def __init__(self, instream):
        if six.PY2:
            # In Python 2, we can't duck type properly because unicode has
            # a 'decode' function, and we'd be double-decoding
            if isinstance(instream, (binary_type, bytearray)):
                instream = instream.decode()
        else:
            if getattr(instream, 'decode', None) is not None:
                instream = instream.decode()

        if isinstance(instream, text_type):
            self.stream = instream
        elif getattr(instream, 'read', None) is None:
            raise TypeError(
                'Parser must be a string or character stream, not '
                '{itype}'.format(itype=instream.__class__.__name__))
        else:
            self.stream = instream.read()

    def get_tokens(self):
        """
        Break the stored string into lexical units (tokens).

        Tokens are demarcated by changes in the character set: a single
        whitespace character, a run of digits, a run of ASCII letters, a run
        of ``.``, ``/`` and ``:`` characters, or a run of any other
        characters each form one token. NUL characters are removed before
        tokenizing.

        Dots ('.') can be used both as separators (e.g. "Sep.20.2009") and
        as decimal points (e.g. "4:30:21.447"), so ``digits.digits`` is kept
        as one token only when it is not adjacent to a further dot or digit.
        A ``digits , digits`` token triple is re-combined into a single
        ``digits.digits`` token, because in this context the "," is treated
        as a decimal (e.g. in python's default logging format).

        Returns
        -------
        list of str
            The tokens, in the order they appear in the string.
        """
        stream = self.stream.replace('\x00', '')

        # Raw strings keep the backslashes for the regex engine instead of
        # relying on Python passing unknown escape sequences through.
        # TODO: Change \s --> \s+ (this doesn't match existing behavior)
        # TODO: change the punctuation block to punc+ (doesnt match existing)
        # TODO: can we merge the two digit patterns?
        tokens = re.findall(r'\s|'
                            r'(?<![\.\d])\d+\.\d+(?![\.\d])'
                            r'|\d+'
                            r'|[a-zA-Z]+'
                            r'|[\./:]+'
                            r'|[^\da-zA-Z\./:\s]+', stream)

        # Kludge to match ,-decimal behavior; it'd be better to do this
        # later in the process and have a simpler tokenization.
        # Walk by index and jump past a merged triple, so that a token which
        # has already been consumed as the fractional part of one number is
        # never reused as the integer part of the next one.
        merged = []
        n = 0
        last_start = len(tokens) - 2
        while n < last_start:
            if (tokens[n].isdigit() and tokens[n + 1] == ',' and
                    tokens[n + 2].isdigit()):
                merged.append(tokens[n] + '.' + tokens[n + 2])
                n += 3
            else:
                merged.append(tokens[n])
                n += 1

        # Fewer than three tokens remain, so no further merge is possible
        merged.extend(tokens[n:])
        return merged

    @classmethod
    def split(cls, s):
        """Tokenize ``s`` in one call: build a lexer and return its tokens."""
        return cls(s).get_tokens()
606
+
607
+
608
def _lexer_split_from_str(dt_str):
    """
    Tokenize ``dt_str`` with ``_timelex``.

    The argument is converted with ``str`` and wrapped in a ``StringIO``
    before being handed to ``_timelex.split``; the resulting token list is
    returned unchanged.
    """
    # The StringIO(str(_)) is for dateutil 2.2 compatibility
    text_stream = StringIO(str(dt_str))
    return _timelex.split(text_stream)


# Module-level alias for the splitter defined above
_DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
549
614
550
615
551
616
def _format_is_iso (f ) -> bint:
0 commit comments