Skip to content

Commit 4c4d152

Browse files
committed
2 parents 63f54e5 + 6f7f5f8 commit 4c4d152

File tree

5 files changed

+100
-26
lines changed

5 files changed

+100
-26
lines changed

doc/source/whatsnew/v0.15.2.txt

+4
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,16 @@ Enhancements
7979
- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
8080
- ``Series`` now works with map objects the same way as generators (:issue:`8909`).
8181
- Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`).
82+
- ``to_datetime`` gains an ``exact`` keyword to allow for a format to not require an exact match for a provided format string (if it is ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)
8283

8384
.. _whatsnew_0152.performance:
8485

8586
Performance
8687
~~~~~~~~~~~
8788
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
8889

90+
- Performance boost for ``to_datetime`` conversions with a passed ``format=`` and ``exact=False`` (:issue:`8904`)
91+
8992
.. _whatsnew_0152.experimental:
9093

9194
Experimental
@@ -142,6 +145,7 @@ Bug Fixes
142145

143146
- Report a ``TypeError`` when invalid/no parameters are passed in a groupby (:issue:`8015`)
144147
- Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`)
148+
- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`)
145149
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
146150
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
147151
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).

pandas/tseries/tests/test_timeseries.py

+26
Original file line numberDiff line numberDiff line change
@@ -4123,6 +4123,32 @@ def test_to_datetime_format_time(self):
41234123
for s, format, dt in data:
41244124
self.assertEqual(to_datetime(s, format=format), dt)
41254125

4126+
def test_to_datetime_with_non_exact(self):
4127+
4128+
# 8904
4129+
# exact kw
4130+
if sys.version_info < (2, 7):
4131+
raise nose.SkipTest('on python version < 2.7')
4132+
4133+
s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z'])
4134+
result = to_datetime(s,format='%d%b%y',exact=False)
4135+
expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
4136+
assert_series_equal(result, expected)
4137+
4138+
def test_parse_nanoseconds_with_formula(self):
4139+
4140+
# GH8989
4141+
# truncating the nanoseconds when a format was provided
4142+
for v in ["2012-01-01 09:00:00.000000001",
4143+
"2012-01-01 09:00:00.000001",
4144+
"2012-01-01 09:00:00.001",
4145+
"2012-01-01 09:00:00.001000",
4146+
"2012-01-01 09:00:00.001000000",
4147+
]:
4148+
expected = pd.to_datetime(v)
4149+
result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
4150+
self.assertEqual(result,expected)
4151+
41264152
def test_to_datetime_format_weeks(self):
41274153
data = [
41284154
['2009324', '%Y%W%w', Timestamp('2009-08-13')],

pandas/tseries/tools.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
174174
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
175175

176176
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
177-
format=None, coerce=False, unit='ns',
177+
format=None, exact=True, coerce=False, unit='ns',
178178
infer_datetime_format=False):
179179
"""
180180
Convert argument to datetime.
@@ -194,7 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
194194
box : boolean, default True
195195
If True returns a DatetimeIndex, if False returns ndarray of values
196196
format : string, default None
197-
strftime to parse time, eg "%d/%m/%Y"
197+
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
198+
all the way up to nanoseconds
199+
exact : boolean, True by default
200+
If True, require an exact format match.
201+
If False, allow the format to match anywhere in the target string.
198202
coerce : force errors to NaT (False by default)
199203
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
200204
(e.g. a unix timestamp), which is an integer/float number
@@ -273,7 +277,7 @@ def _convert_listlike(arg, box, format):
273277
if result is None:
274278
try:
275279
result = tslib.array_strptime(
276-
arg, format, coerce=coerce
280+
arg, format, exact=exact, coerce=coerce
277281
)
278282
except (tslib.OutOfBoundsDatetime):
279283
if errors == 'raise':

pandas/tslib.pyx

+55-23
Original file line numberDiff line numberDiff line change
@@ -2123,13 +2123,25 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
21232123
raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
21242124
return ts.astype('timedelta64[ns]')
21252125

2126-
def array_strptime(ndarray[object] values, object fmt, coerce=False):
2126+
def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
2127+
"""
2128+
Parameters
2129+
----------
2130+
values : ndarray of string-like objects
2131+
fmt : string-like regex
2132+
exact : matches must be exact if True, search if False
2133+
coerce : if invalid values found, coerce to NaT
2134+
"""
2135+
21272136
cdef:
21282137
Py_ssize_t i, n = len(values)
21292138
pandas_datetimestruct dts
21302139
ndarray[int64_t] iresult
2131-
int year, month, day, minute, hour, second, fraction, weekday, julian
2132-
object val
2140+
int year, month, day, minute, hour, second, weekday, julian, tz
2141+
int week_of_year, week_of_year_start
2142+
int64_t us, ns
2143+
object val, group_key, ampm, found
2144+
dict found_key
21332145

21342146
global _TimeRE_cache, _regex_cache
21352147
with _cache_lock:
@@ -2198,22 +2210,35 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
21982210
else:
21992211
val = str(val)
22002212

2201-
found = format_regex.match(val)
2202-
if not found:
2203-
if coerce:
2204-
iresult[i] = iNaT
2205-
continue
2206-
raise ValueError("time data %r does not match format %r" %
2207-
(values[i], fmt))
2208-
if len(val) != found.end():
2209-
if coerce:
2210-
iresult[i] = iNaT
2211-
continue
2212-
raise ValueError("unconverted data remains: %s" %
2213-
values[i][found.end():])
2213+
# exact matching
2214+
if exact:
2215+
found = format_regex.match(val)
2216+
if not found:
2217+
if coerce:
2218+
iresult[i] = iNaT
2219+
continue
2220+
raise ValueError("time data %r does not match format %r (match)" %
2221+
(values[i], fmt))
2222+
if len(val) != found.end():
2223+
if coerce:
2224+
iresult[i] = iNaT
2225+
continue
2226+
raise ValueError("unconverted data remains: %s" %
2227+
values[i][found.end():])
2228+
2229+
# search
2230+
else:
2231+
found = format_regex.search(val)
2232+
if not found:
2233+
if coerce:
2234+
iresult[i] = iNaT
2235+
continue
2236+
raise ValueError("time data %r does not match format %r (search)" %
2237+
(values[i], fmt))
2238+
22142239
year = 1900
22152240
month = day = 1
2216-
hour = minute = second = fraction = 0
2241+
hour = minute = second = ns = us = 0
22172242
tz = -1
22182243
# Default to -1 to signify that values not known; not critical to have,
22192244
# though
@@ -2278,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
22782303
second = int(found_dict['S'])
22792304
elif parse_code == 10:
22802305
s = found_dict['f']
2281-
# Pad to always return microseconds.
2282-
s += "0" * (6 - len(s))
2283-
fraction = int(s)
2306+
# Pad to always return nanoseconds
2307+
s += "0" * (9 - len(s))
2308+
us = long(s)
2309+
ns = us % 1000
2310+
us = us / 1000
22842311
elif parse_code == 11:
22852312
weekday = locale_time.f_weekday.index(found_dict['A'].lower())
22862313
elif parse_code == 12:
@@ -2345,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
23452372
dts.hour = hour
23462373
dts.min = minute
23472374
dts.sec = second
2348-
dts.us = fraction
2375+
dts.us = us
2376+
dts.ps = ns * 1000
23492377

23502378
iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
23512379
try:
@@ -4287,7 +4315,7 @@ class TimeRE(dict):
42874315
base.__init__({
42884316
# The " \d" part of the regex is to make %c from ANSI C work
42894317
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
4290-
'f': r"(?P<f>[0-9]{1,6})",
4318+
'f': r"(?P<f>[0-9]{1,9})",
42914319
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
42924320
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
42934321
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
@@ -4368,10 +4396,14 @@ _TimeRE_cache = TimeRE()
43684396
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
43694397
_regex_cache = {}
43704398

4371-
def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
4399+
cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
43724400
"""Calculate the Julian day based on the year, week of the year, and day of
43734401
the week, with week_start_day representing whether the week of the year
43744402
assumes the week starts on Sunday or Monday (6 or 0)."""
4403+
4404+
cdef:
4405+
int first_weekday, week_0_length, days_to_week
4406+
43754407
first_weekday = datetime_date(year, 1, 1).weekday()
43764408
# If we are dealing with the %U directive (week starts on Sunday), it's
43774409
# easier to just shift the view to Sunday being the first day of the

vb_suite/timeseries.py

+8
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
156156
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
157157
start_date=datetime(2012, 7, 1))
158158

159+
setup = common_setup + """
160+
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
161+
"""
162+
timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
163+
setup, start_date=datetime(2014, 11, 26))
164+
timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
165+
setup, start_date=datetime(2014, 11, 26))
166+
159167
# ---- infer_freq
160168
# infer_freq
161169

0 commit comments

Comments
 (0)