From ea2489d54689b47a60110c34eebd4298297df961 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 26 Nov 2014 20:38:15 -0500
Subject: [PATCH 1/2] PERF: add exact kw to to_datetime to enable faster regex
 format parsing for datetimes (GH8904)

---
 doc/source/whatsnew/v0.15.2.txt         |  2 +
 pandas/tseries/tests/test_timeseries.py | 10 ++++
 pandas/tseries/tools.py                 |  7 ++-
 pandas/tslib.pyx                        | 62 ++++++++++++++++++-------
 vb_suite/timeseries.py                  |  8 ++++
 5 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 10b23605cca85..34f359fe6a54c 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -85,6 +85,8 @@ Performance
 ~~~~~~~~~~~
 - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
 
+- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`)
+
 .. _whatsnew_0152.experimental:
 
 Experimental
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 12e10a71c67b2..ae68cdabd74c7 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -4123,6 +4123,16 @@ def test_to_datetime_format_time(self):
         for s, format, dt in data:
             self.assertEqual(to_datetime(s, format=format), dt)
 
+    def test_to_datetime_with_non_exact(self):
+
+        if sys.version_info < (2, 7):
+            raise nose.SkipTest('on python version < 2.7')
+
+        s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000)
+        result = to_datetime(s,format='%d%b%y',exact=False)
+        expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
+        assert_series_equal(result, expected)
+
     def test_to_datetime_format_weeks(self):
         data = [
                 ['2009324', '%Y%W%w', Timestamp('2009-08-13')],
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index f29ab14ed8745..d556df9280d3f 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
-                format=None, coerce=False, unit='ns',
+                format=None, exact=True, coerce=False, unit='ns',
                 infer_datetime_format=False):
     """
     Convert argument to datetime.
@@ -195,6 +195,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
         If True returns a DatetimeIndex, if False returns ndarray of values
     format : string, default None
         strftime to parse time, eg "%d/%m/%Y"
+    exact : boolean, True by default
+        if True, require an exact format match
+        if False, search for a matching format non-exclusive to the endpoints
     coerce : force errors to NaT (False by default)
     unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
         (e.g. a unix timestamp), which is an integer/float number
@@ -273,7 +276,7 @@ def _convert_listlike(arg, box, format):
                 if result is None:
                     try:
                         result = tslib.array_strptime(
-                            arg, format, coerce=coerce
+                            arg, format, exact=exact, coerce=coerce
                         )
                     except (tslib.OutOfBoundsDatetime):
                         if errors == 'raise':
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index d7c5d656d71e0..d4f14992949d3 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -2123,13 +2123,24 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
         raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
     return ts.astype('timedelta64[ns]')
 
-def array_strptime(ndarray[object] values, object fmt, coerce=False):
+def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
+    """
+    Parameters
+    ----------
+    values : ndarray of string-like objects
+    fmt : string-like strptime format (compiled to a regex internally)
+    exact : matches must be exact if True, search if False
+    coerce : if invalid values found, coerce to NaT
+    """
+
     cdef:
         Py_ssize_t i, n = len(values)
         pandas_datetimestruct dts
         ndarray[int64_t] iresult
-        int year, month, day, minute, hour, second, fraction, weekday, julian
-        object val
+        int year, month, day, minute, hour, second, fraction, weekday, julian, tz
+        int week_of_year, week_of_year_start
+        object val, group_key, ampm, found
+        dict found_key
 
     global _TimeRE_cache, _regex_cache
     with _cache_lock:
@@ -2198,19 +2209,32 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
             else:
                 val = str(val)
 
-        found = format_regex.match(val)
-        if not found:
-            if coerce:
-                iresult[i] = iNaT
-                continue
-            raise ValueError("time data %r does not match format %r" %
-                             (values[i], fmt))
-        if len(val) != found.end():
-            if coerce:
-                iresult[i] = iNaT
-                continue
-            raise ValueError("unconverted data remains: %s" %
-                              values[i][found.end():])
+        # exact matching
+        if exact:
+            found = format_regex.match(val)
+            if not found:
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("time data %r does not match format %r (match)" %
+                                 (values[i], fmt))
+            if len(val) != found.end():
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("unconverted data remains: %s" %
+                                  values[i][found.end():])
+
+        # search
+        else:
+            found = format_regex.search(val)
+            if not found:
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("time data %r does not match format %r (search)" %
+                                 (values[i], fmt))
+
         year = 1900
         month = day = 1
         hour = minute = second = fraction = 0
@@ -4368,10 +4392,14 @@ _TimeRE_cache = TimeRE()
 _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
 _regex_cache = {}
 
-def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
+cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
     """Calculate the Julian day based on the year, week of the year, and day of
     the week, with week_start_day representing whether the week of the year
     assumes the week starts on Sunday or Monday (6 or 0)."""
+
+    cdef:
+        int first_weekday,  week_0_length, days_to_week
+
     first_weekday = datetime_date(year, 1, 1).weekday()
     # If we are dealing with the %U directive (week starts on Sunday), it's
     # easier to just shift the view to Sunday being the first day of the
diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
index c67cdabdc1a06..1bcd2de5dbefe 100644
--- a/vb_suite/timeseries.py
+++ b/vb_suite/timeseries.py
@@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
     Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
               start_date=datetime(2012, 7, 1))
 
+setup = common_setup + """
+s = Series(['19MAY11','19MAY11:00:00:00']*100000)
+"""
+timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
+     setup, start_date=datetime(2014, 11, 26))
+timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
+     setup, start_date=datetime(2014, 11, 26))
+
 # ---- infer_freq
 # infer_freq
 

From d6e43373db5ae0201f84254fe4d7ab7684835cb6 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 3 Dec 2014 18:53:03 -0500
Subject: [PATCH 2/2] BUG: fix GH8989 to parse nanoseconds with %f format

---
 doc/source/whatsnew/v0.15.2.txt         |  4 +++-
 pandas/tseries/tests/test_timeseries.py | 18 +++++++++++++++++-
 pandas/tseries/tools.py                 |  7 ++++---
 pandas/tslib.pyx                        | 18 +++++++++++-------
 vb_suite/timeseries.py                  |  2 +-
 5 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 34f359fe6a54c..7e8a4c7ba4faf 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -78,6 +78,7 @@ Enhancements
 - Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
 - ``Series`` now works with map objects the same way as generators (:issue:`8909`).
 - Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`).
+- ``to_datetime`` gains an ``exact`` keyword to allow for a format to not require an exact match for a provided format string (if it is ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)
 
 .. _whatsnew_0152.performance:
 
@@ -85,7 +86,7 @@ Performance
 ~~~~~~~~~~~
 - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
 
-- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`)
+- Performance boost for ``to_datetime`` conversions with a passed ``format=``, and with ``exact=False`` (:issue:`8904`)
 
 .. _whatsnew_0152.experimental:
 
@@ -143,6 +144,7 @@ Bug Fixes
 
 - Report a ``TypeError`` when invalid/no paramaters are passed in a groupby (:issue:`8015`)
 - Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`)
+- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`)
 - Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
 - Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
 - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index ae68cdabd74c7..7a428fd629125 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -4125,14 +4125,30 @@ def test_to_datetime_format_time(self):
 
     def test_to_datetime_with_non_exact(self):
 
+        # 8904
+        # exact kw
         if sys.version_info < (2, 7):
             raise nose.SkipTest('on python version < 2.7')
 
-        s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000)
+        s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z'])
         result = to_datetime(s,format='%d%b%y',exact=False)
         expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
         assert_series_equal(result, expected)
 
+    def test_parse_nanoseconds_with_formula(self):
+
+        # GH8989
+        # truncating the nanoseconds when a format was provided
+        for v in ["2012-01-01 09:00:00.000000001",
+                  "2012-01-01 09:00:00.000001",
+                  "2012-01-01 09:00:00.001",
+                  "2012-01-01 09:00:00.001000",
+                  "2012-01-01 09:00:00.001000000",
+                  ]:
+            expected = pd.to_datetime(v)
+            result =  pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
+            self.assertEqual(result,expected)
+
     def test_to_datetime_format_weeks(self):
         data = [
                 ['2009324', '%Y%W%w', Timestamp('2009-08-13')],
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index d556df9280d3f..e680fa06a9c8e 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -194,10 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
     box : boolean, default True
         If True returns a DatetimeIndex, if False returns ndarray of values
     format : string, default None
-        strftime to parse time, eg "%d/%m/%Y"
+        strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
+        all the way up to nanoseconds
     exact : boolean, True by default
-        if True, require an exact format match
-        if False, search for a matching format non-exclusive to the endpoints
+        If True, require an exact format match.
+        If False, allow the format to match anywhere in the target string.
     coerce : force errors to NaT (False by default)
     unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
         (e.g. a unix timestamp), which is an integer/float number
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index d4f14992949d3..4cb6c93bdf3d0 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -2137,8 +2137,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
         Py_ssize_t i, n = len(values)
         pandas_datetimestruct dts
         ndarray[int64_t] iresult
-        int year, month, day, minute, hour, second, fraction, weekday, julian, tz
+        int year, month, day, minute, hour, second, weekday, julian, tz
         int week_of_year, week_of_year_start
+        int64_t us, ns
         object val, group_key, ampm, found
         dict found_key
 
@@ -2237,7 +2238,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
 
         year = 1900
         month = day = 1
-        hour = minute = second = fraction = 0
+        hour = minute = second = ns = us = 0
         tz = -1
         # Default to -1 to signify that values not known; not critical to have,
         # though
@@ -2302,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
                 second = int(found_dict['S'])
             elif parse_code == 10:
                 s = found_dict['f']
-                # Pad to always return microseconds.
-                s += "0" * (6 - len(s))
-                fraction = int(s)
+                # Pad to always return nanoseconds
+                s += "0" * (9 - len(s))
+                us = long(s)
+                ns = us % 1000
+                us = us / 1000
             elif parse_code == 11:
                 weekday = locale_time.f_weekday.index(found_dict['A'].lower())
             elif parse_code == 12:
@@ -2369,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
         dts.hour = hour
         dts.min = minute
         dts.sec = second
-        dts.us = fraction
+        dts.us = us
+        dts.ps = ns * 1000
 
         iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
         try:
@@ -4311,7 +4315,7 @@ class TimeRE(dict):
         base.__init__({
             # The " \d" part of the regex is to make %c from ANSI C work
             'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
-            'f': r"(?P<f>[0-9]{1,6})",
+            'f': r"(?P<f>[0-9]{1,9})",
             'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
             'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
             'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
index 1bcd2de5dbefe..f0c3961ae0277 100644
--- a/vb_suite/timeseries.py
+++ b/vb_suite/timeseries.py
@@ -159,7 +159,7 @@ def date_range(start=None, end=None, periods=None, freq=None):
 setup = common_setup + """
 s = Series(['19MAY11','19MAY11:00:00:00']*100000)
 """
-timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
+timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
      setup, start_date=datetime(2014, 11, 26))
 timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
      setup, start_date=datetime(2014, 11, 26))