Skip to content

Commit d6e4337

Browse files
committed
BUG: fix GH8989 to parse nanoseconds with %f format
1 parent ea2489d commit d6e4337

File tree

5 files changed

+36
-13
lines changed

5 files changed

+36
-13
lines changed

doc/source/whatsnew/v0.15.2.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,15 @@ Enhancements
7878
- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
7979
- ``Series`` now works with map objects the same way as generators (:issue:`8909`).
8080
- Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`).
81+
- ``to_datetime`` gains an ``exact`` keyword to allow for a format to not require an exact match for a provided format string (if it is ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)
8182

8283
.. _whatsnew_0152.performance:
8384

8485
Performance
8586
~~~~~~~~~~~
8687
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
8788

88-
- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`)
89+
- Performance boost for ``to_datetime`` conversions with a passed ``format=``, and the ``exact=False`` (:issue:`8904`)
8990

9091
.. _whatsnew_0152.experimental:
9192

@@ -143,6 +144,7 @@ Bug Fixes
143144

144145
- Report a ``TypeError`` when invalid/no parameters are passed in a groupby (:issue:`8015`)
145146
- Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`)
147+
- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`)
146148
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
147149
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
148150
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).

pandas/tseries/tests/test_timeseries.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -4125,14 +4125,30 @@ def test_to_datetime_format_time(self):
41254125

41264126
def test_to_datetime_with_non_exact(self):
41274127

4128+
# 8904
4129+
# exact kw
41284130
if sys.version_info < (2, 7):
41294131
raise nose.SkipTest('on python version < 2.7')
41304132

4131-
s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000)
4133+
s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z'])
41324134
result = to_datetime(s,format='%d%b%y',exact=False)
41334135
expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
41344136
assert_series_equal(result, expected)
41354137

4138+
def test_parse_nanoseconds_with_formula(self):
4139+
4140+
# GH8989
4141+
# truncating the nanoseconds when a format was provided
4142+
for v in ["2012-01-01 09:00:00.000000001",
4143+
"2012-01-01 09:00:00.000001",
4144+
"2012-01-01 09:00:00.001",
4145+
"2012-01-01 09:00:00.001000",
4146+
"2012-01-01 09:00:00.001000000",
4147+
]:
4148+
expected = pd.to_datetime(v)
4149+
result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
4150+
self.assertEqual(result,expected)
4151+
41364152
def test_to_datetime_format_weeks(self):
41374153
data = [
41384154
['2009324', '%Y%W%w', Timestamp('2009-08-13')],

pandas/tseries/tools.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
194194
box : boolean, default True
195195
If True returns a DatetimeIndex, if False returns ndarray of values
196196
format : string, default None
197-
strftime to parse time, eg "%d/%m/%Y"
197+
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
198+
all the way up to nanoseconds
198199
exact : boolean, True by default
199-
if True, require an exact format match
200-
if False, search for a matching format non-exclusive to the endpoints
200+
If True, require an exact format match.
201+
If False, allow the format to match anywhere in the target string.
201202
coerce : force errors to NaT (False by default)
202203
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
203204
(e.g. a unix timestamp), which is an integer/float number

pandas/tslib.pyx

+11-7
Original file line numberDiff line numberDiff line change
@@ -2137,8 +2137,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
21372137
Py_ssize_t i, n = len(values)
21382138
pandas_datetimestruct dts
21392139
ndarray[int64_t] iresult
2140-
int year, month, day, minute, hour, second, fraction, weekday, julian, tz
2140+
int year, month, day, minute, hour, second, weekday, julian, tz
21412141
int week_of_year, week_of_year_start
2142+
int64_t us, ns
21422143
object val, group_key, ampm, found
21432144
dict found_key
21442145

@@ -2237,7 +2238,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
22372238

22382239
year = 1900
22392240
month = day = 1
2240-
hour = minute = second = fraction = 0
2241+
hour = minute = second = ns = us = 0
22412242
tz = -1
22422243
# Default to -1 to signify that values not known; not critical to have,
22432244
# though
@@ -2302,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
23022303
second = int(found_dict['S'])
23032304
elif parse_code == 10:
23042305
s = found_dict['f']
2305-
# Pad to always return microseconds.
2306-
s += "0" * (6 - len(s))
2307-
fraction = int(s)
2306+
# Pad to always return nanoseconds
2307+
s += "0" * (9 - len(s))
2308+
us = long(s)
2309+
ns = us % 1000
2310+
us = us / 1000
23082311
elif parse_code == 11:
23092312
weekday = locale_time.f_weekday.index(found_dict['A'].lower())
23102313
elif parse_code == 12:
@@ -2369,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
23692372
dts.hour = hour
23702373
dts.min = minute
23712374
dts.sec = second
2372-
dts.us = fraction
2375+
dts.us = us
2376+
dts.ps = ns * 1000
23732377

23742378
iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
23752379
try:
@@ -4311,7 +4315,7 @@ class TimeRE(dict):
43114315
base.__init__({
43124316
# The " \d" part of the regex is to make %c from ANSI C work
43134317
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
4314-
'f': r"(?P<f>[0-9]{1,6})",
4318+
'f': r"(?P<f>[0-9]{1,9})",
43154319
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
43164320
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
43174321
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",

vb_suite/timeseries.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def date_range(start=None, end=None, periods=None, freq=None):
159159
setup = common_setup + """
160160
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
161161
"""
162-
timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
162+
timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
163163
setup, start_date=datetime(2014, 11, 26))
164164
timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
165165
setup, start_date=datetime(2014, 11, 26))

0 commit comments

Comments
 (0)