diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 2c97cae80ae2a..528dd2e05031d 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -266,6 +266,19 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``. Other API Changes ^^^^^^^^^^^^^^^^^ +- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing +  of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) + +.. code-block:: python + + In [3]: s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) + + In [4]: s.between_time("7:00am", "9:00am") + Out[4]: + 2015-01-01 07:00:00 7 + 2015-01-01 08:00:00 8 + 2015-01-01 09:00:00 9 + Freq: H, dtype: int64 diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 091c6245e346d..254704f21387c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -21,7 +21,7 @@ Resolution) from pandas.tseries.base import DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay -from pandas.tseries.tools import parse_time_string, normalize_date +from pandas.tseries.tools import parse_time_string, normalize_date, to_time from pandas.tseries.timedeltas import to_timedelta from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.core.common as com @@ -109,12 +109,12 @@ def _ensure_datetime64(other): return other raise TypeError('%s type object %s' % (type(other), str(other))) - _midnight = time(0, 0) + def _new_DatetimeIndex(cls, d): - """ This is called upon unpickling, rather than the default which doesn't have arguments - and breaks __new__ """ + """ This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__ """ # data are already in UTC # so need to localize @@ -1755,12 +1755,18 @@ def indexer_at_time(self, time, asof=False): def indexer_between_time(self, 
start_time, end_time, include_start=True, include_end=True): """ - Select values between particular times of day (e.g., 9:00-9:30AM) + Select values between particular times of day (e.g., 9:00-9:30AM). + + Return values of the index between two times. If start_time or + end_time is a string then tseries.tools.to_time is used to convert to + a time object. Parameters ---------- - start_time : datetime.time or string - end_time : datetime.time or string + start_time, end_time : datetime.time, str + datetime.time or string in appropriate format ("%H:%M", "%H%M", + "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p") include_start : boolean, default True include_end : boolean, default True tz : string or pytz.timezone or dateutil.tz.tzfile, default None @@ -1769,18 +1775,8 @@ def indexer_between_time(self, start_time, end_time, include_start=True, ------- values_between_time : TimeSeries """ - from dateutil.parser import parse - - if isinstance(start_time, compat.string_types): - start_time = parse(start_time).time() - - if isinstance(end_time, compat.string_types): - end_time = parse(end_time).time() - - if start_time.tzinfo or end_time.tzinfo: - raise NotImplementedError("argument 'time' with timezone info is " - "not supported") - + start_time = to_time(start_time) + end_time = to_time(end_time) time_micros = self._get_time_micros() start_micros = _time_to_micros(start_time) end_micros = _time_to_micros(end_time) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index cf970807999e0..96aeb8b37073f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -5,29 +5,27 @@ import operator import warnings import nose - import numpy as np -randn = np.random.randn - +import pandas.tseries.frequencies as frequencies +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.index as _index +import pandas as pd from pandas import (Index, Series, DataFrame, 
isnull, date_range, Timestamp, Period, DatetimeIndex, Int64Index, to_datetime, bdate_range, Float64Index, - TimedeltaIndex, NaT, timedelta_range, Timedelta) + NaT, timedelta_range, Timedelta) import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools -import pandas.tseries.frequencies as frequencies -import pandas as pd -from pandas.util.testing import assert_series_equal, assert_almost_equal -import pandas.util.testing as tm -from pandas.tslib import NaT, iNaT -import pandas.lib as lib -import pandas.tslib as tslib +from pandas.util.testing import assert_series_equal, assert_almost_equal,\ + _skip_if_has_locale +import pandas.util.testing as tm -import pandas.index as _index +from pandas.tslib import iNaT from pandas.compat import range, long, StringIO, lrange, lmap, zip, product from numpy.random import rand @@ -40,12 +38,7 @@ from numpy.testing.decorators import slow - -def _skip_if_has_locale(): - import locale - lang, _ = locale.getlocale() - if lang is not None: - raise nose.SkipTest("Specific locale is set {0}".format(lang)) +randn = np.random.randn class TestTimeSeriesDuplicates(tm.TestCase): @@ -93,7 +86,8 @@ def test_index_unique(self): self.assertEqual(idx.nunique(), 20) self.assertEqual(idx.nunique(dropna=False), 21) - arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT] + arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for + t in range(20) ] + [NaT] idx = DatetimeIndex(arr * 3) self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) self.assertEqual(idx.nunique(), 20) @@ -258,23 +252,29 @@ def test_indexing(self): assert_series_equal(expected, result) # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', freq='H') + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', + freq='H') ts = Series(lrange(len(idx)), index=idx) expected = ts['2013-05'] 
assert_series_equal(expected, ts) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', freq='S') + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', + freq='S') ts = Series(lrange(len(idx)), index=idx) expected = ts['2013-05'] assert_series_equal(expected,ts) - idx = [ Timestamp('2013-05-31 00:00'), Timestamp(datetime(2013,5,31,23,59,59,999999))] - ts = Series(lrange(len(idx)), index=idx) + idx = [Timestamp('2013-05-31 00:00'), + Timestamp(datetime(2013,5,31,23,59,59,999999))] + ts = Series(lrange(len(idx)), index=idx) expected = ts['2013'] assert_series_equal(expected,ts) # GH 3925, indexing with a seconds resolution string / datetime object - df = DataFrame(randn(5,5),columns=['open','high','low','close','volume'],index=date_range('2012-01-02 18:01:00',periods=5,tz='US/Central',freq='s')) + df = DataFrame(randn(5,5), + columns=['open', 'high', 'low', 'close', 'volume'], + index=date_range('2012-01-02 18:01:00', + periods=5, tz='US/Central', freq='s')) expected = df.loc[[df.index[2]]] result = df['2012-01-02 18:01:02'] assert_frame_equal(result,expected) @@ -283,14 +283,16 @@ def test_indexing(self): self.assertRaises(KeyError, df.__getitem__, df.index[2],) def test_recreate_from_data(self): - freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', 'C'] + freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', + 'S', 'L', 'U', 'H', 'N', 'C'] for f in freqs: org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) idx = DatetimeIndex(org, freq=f) self.assertTrue(idx.equals(org)) - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, tz='US/Pacific', periods=1) + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, + tz='US/Pacific', periods=1) idx = DatetimeIndex(org, freq=f, tz='US/Pacific') self.assertTrue(idx.equals(org)) @@ -459,7 +461,8 @@ def _check_rng(rng): self.assertEqual(x.tzinfo, stamp.tzinfo) rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', 
tz=pytz.timezone('US/Eastern')) + rng_eastern = date_range('20090415', '20090519', + tz=pytz.timezone('US/Eastern')) rng_utc = date_range('20090415', '20090519', tz=pytz.utc) _check_rng(rng) @@ -479,7 +482,8 @@ def _check_rng(rng): self.assertEqual(x.tzinfo, stamp.tzinfo) rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', tz='dateutil/US/Eastern') + rng_eastern = date_range('20090415', '20090519', + tz='dateutil/US/Eastern') rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) _check_rng(rng) @@ -1524,6 +1528,38 @@ def test_between_time_frame(self): else: self.assertTrue((t < etime) or (t >= stime)) + def test_between_time_types(self): + # GH11818 + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + self.assertRaises(ValueError, rng.indexer_between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + frame = DataFrame({'A': 0}, index=rng) + self.assertRaises(ValueError, frame.between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + series = Series(0, index=rng) + self.assertRaises(ValueError, series.between_time, + datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + def test_between_time_formats(self): + # GH11818 + _skip_if_has_locale() + + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + + strings = [("2:00", "2:30"), ("0200", "0230"), + ("2:00am", "2:30am"), ("0200am", "0230am"), + ("2:00:00", "2:30:00"), ("020000", "023000"), + ("2:00:00am", "2:30:00am"), ("020000am", "023000am")] + expected_length = 28 + + for time_string in strings: + self.assertEqual(len(ts.between_time(*time_string)), + expected_length, + "%s - %s" % time_string) + def test_dti_constructor_preserve_dti_freq(self): rng = date_range('1/1/2000', '1/2/2000', freq='5min') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index d27bddf8879db..111a848a28ad7 100644 --- a/pandas/tseries/tests/test_tslib.py +++ 
b/pandas/tseries/tests/test_tslib.py @@ -15,8 +15,7 @@ import pandas.tseries.offsets as offsets import pandas.util.testing as tm import pandas.compat as compat -from pandas.util.testing import assert_series_equal -import pandas.compat as compat +from pandas.util.testing import assert_series_equal, _skip_if_has_locale class TestTimestamp(tm.TestCase): @@ -617,6 +616,41 @@ def test_parsers_timestring(self): self.assertEqual(result4, exp_now) self.assertEqual(result5, exp_now) + def test_parsers_time(self): + # GH11818 + _skip_if_has_locale() + strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500", + "2:15:00pm", "021500pm", datetime.time(14, 15)] + expected = datetime.time(14, 15) + + for time_string in strings: + self.assertEqual(tools.to_time(time_string), expected) + + new_string = "14.15" + self.assertRaises(ValueError, tools.to_time, new_string) + self.assertEqual(tools.to_time(new_string, format="%H.%M"), expected) + tools.add_time_format("%H.%M") + self.assertEqual(tools.to_time(new_string), expected) + + arg = ["14:15", "20:20"] + expected_arr = [datetime.time(14, 15), datetime.time(20, 20)] + self.assertEqual(tools.to_time(arg), expected_arr) + self.assertEqual(tools.to_time(arg, format="%H:%M"), expected_arr) + self.assertEqual(tools.to_time(arg, infer_time_format=True), + expected_arr) + self.assertEqual(tools.to_time(arg, format="%I:%M%p", errors="coerce"), + [None, None]) + self.assert_numpy_array_equal(tools.to_time(arg, format="%I:%M%p", + errors="ignore"), + np.array(arg)) + self.assertRaises(ValueError, lambda: tools.to_time(arg, + format="%I:%M%p", + errors="raise")) + self.assert_series_equal(tools.to_time(Series(arg, name="test")), + Series(expected_arr, name="test")) + self.assert_numpy_array_equal(tools.to_time(np.array(arg)), + np.array(expected_arr)) + def test_parsers_monthfreq(self): cases = {'201101': datetime.datetime(2011, 1, 1, 0, 0), '200005': datetime.datetime(2000, 5, 1, 0, 0)} diff --git a/pandas/tseries/tools.py 
b/pandas/tseries/tools.py index 995c920358b9f..f2233d19faff9 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -1,5 +1,4 @@ -from datetime import datetime, timedelta -import re +from datetime import datetime, timedelta, time import sys import numpy as np @@ -7,7 +6,6 @@ import pandas.lib as lib import pandas.tslib as tslib import pandas.core.common as com -from pandas.compat import StringIO, callable from pandas.core.common import ABCIndexClass import pandas.compat as compat from pandas.util.decorators import deprecate_kwarg @@ -33,7 +31,7 @@ if hasattr(_timelex, 'split'): def _lexer_split_from_str(dt_str): # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(StringIO(str(dt_str))) + return _timelex.split(compat.StringIO(str(dt_str))) _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str except (ImportError, AttributeError): @@ -68,7 +66,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, If True parses dates with the day first, eg 20/01/2005 Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug). 
- dt_str_parse : function, defaults to `compate.parse_date` (dateutil) + dt_str_parse : function, defaults to `compat.parse_date` (dateutil) This function should take in a datetime string and return a `datetime.datetime` guess that the datetime string represents dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) @@ -78,7 +76,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, Returns ------- - ret : datetime formatt string (for `strftime` or `strptime`) + ret : datetime format string (for `strftime` or `strptime`) """ if dt_str_parse is None or dt_str_split is None: return None @@ -329,9 +327,8 @@ def _convert_listlike(arg, box, format, name=None): # special case format_is_iso8601 = ( ('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or - '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and - format != '%Y' - ) + '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and + format != '%Y') if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None @@ -345,14 +342,15 @@ def _convert_listlike(arg, box, format, name=None): try: result = _attempt_YYYYMMDD(arg, errors=errors) except: - raise ValueError("cannot convert the input to '%Y%m%d' date format") + raise ValueError("cannot convert the input to " + "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime( arg, format, exact=exact, errors=errors) - except (tslib.OutOfBoundsDatetime): + except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg @@ -366,14 +364,17 @@ def _convert_listlike(arg, box, format, name=None): result = arg if result is None and (format is None or infer_datetime_format): - result = tslib.array_to_datetime(arg, errors=errors, - utc=utc, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq, - unit=unit, - require_iso8601=require_iso8601) + result = tslib.array_to_datetime( + arg, errors=errors, + utc=utc, dayfirst=dayfirst, + yearfirst=yearfirst, + freq=freq, unit=unit, + require_iso8601=require_iso8601) if 
com.is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, tz='utc' if utc else None, name=name) + result = DatetimeIndex(result, + tz='utc' if utc else None, + name=name) return result except ValueError as e: @@ -400,24 +401,28 @@ def _convert_listlike(arg, box, format, name=None): def _attempt_YYYYMMDD(arg, errors): """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, - arg is a passed in as an object dtype, but could really be ints/strings with nan-like/or floats (e.g. with nan) + arg is a passed in as an object dtype, but could really be ints/strings + with nan-like/or floats (e.g. with nan) - Parameters - ---------- - arg : passed value - errors : 'raise','ignore','coerce' - """ + Parameters + ---------- + arg : passed value + errors : 'raise','ignore','coerce' + """ def calc(carg): # calculate the actual result carg = carg.astype(object) - return tslib.array_to_datetime(lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100), errors=errors) + return tslib.array_to_datetime( + lib.try_parse_year_month_day(carg/10000, carg/100 % 100, + carg % 100), errors=errors) - def calc_with_mask(carg,mask): + def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype='M8[ns]') iresult = result.view('i8') iresult[~mask] = tslib.iNaT - result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).astype('M8[ns]') + result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).\ + astype('M8[ns]') return result # try intlike / strings that are ints @@ -475,7 +480,8 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): if yearfirst is None: yearfirst = get_option("display.date_yearfirst") - return tslib.parse_datetime_string_with_reso(arg, freq=freq, dayfirst=dayfirst, + return tslib.parse_datetime_string_with_reso(arg, freq=freq, + dayfirst=dayfirst, yearfirst=yearfirst) @@ -483,6 +489,131 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): normalize_date = 
tslib.normalize_date +# Fixed time formats for time parsing +_time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] + + +def add_time_format(time_format): + _time_formats.append(time_format) + + +def _guess_time_format_for_array(arr): + # Try to guess the format based on the first non-NaN element + non_nan_elements = com.notnull(arr).nonzero()[0] + if len(non_nan_elements): + element = arr[non_nan_elements[0]] + for time_format in _time_formats: + try: + datetime.strptime(element, time_format) + return time_format + except ValueError: + pass + + return None + + +def to_time(arg, format=None, infer_time_format=False, errors='raise'): + """ + Parse time strings to time objects using fixed strptime formats ("%H:%M", + "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p"). + + Use infer_time_format if all the strings are in the same format to speed + up conversion. + + Parameters + ---------- + arg : string in time format, datetime.time, list, tuple, 1-d array, Series + format : str, default None + Format used to convert arg into a time object. If None, fixed formats + are used. Use tools.add_time_format to add an additional fixed format. + infer_time_format : bool, default False + Infer the time format based on the first non-NaN element. If all + strings are in the same format, this will speed up conversion. 
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception + - If 'coerce', then invalid parsing will be set as None + - If 'ignore', then invalid parsing will return the input + + Returns + ------- + datetime.time for scalar input, list of datetime.time for list-like input, Series for Series input + """ + from pandas.core.series import Series + + def _convert_listlike(arg, format): + + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype='O') + + elif getattr(arg, 'ndim', 1) > 1: + raise TypeError('arg must be a string, datetime, list, tuple, ' + '1-d array, or Series') + + arg = com._ensure_object(arg) + + if infer_time_format and format is None: + format = _guess_time_format_for_array(arg) + + times = [] + if format is not None: + for element in arg: + try: + times.append(datetime.strptime(element, format).time()) + except (ValueError, TypeError): + if errors == 'raise': + raise ValueError("Cannot convert %s to a time with " + "given format %s" % (element, format)) + elif errors == 'ignore': + return arg + else: + times.append(None) + else: + formats = _time_formats[:] + format_found = False + for element in arg: + time_object = None + for time_format in formats: + try: + time_object = datetime.strptime(element, + time_format).time() + if not format_found: + # Put the found format in front + formats.insert(0, formats.pop( + formats.index(time_format))) + format_found = True + break + except (ValueError, TypeError): + continue + + if time_object is not None: + times.append(time_object) + elif errors == 'raise': + raise ValueError("Cannot convert arg %s to a time. Pass " + "in format or add default format." 
% arg) + elif errors == 'ignore': + return arg + else: + times.append(None) + + return times + + if arg is None: + return arg + elif isinstance(arg, time): + return arg + elif isinstance(arg, Series): + values = _convert_listlike(arg._values, format) + return Series(values, index=arg.index, name=arg.name) + elif isinstance(arg, ABCIndexClass): + return _convert_listlike(arg, format) + elif com.is_list_like(arg): + return _convert_listlike(arg, format) + + return _convert_listlike(np.array([arg]), format)[0] + + def format(dt): """Returns date in YYYYMMDD format.""" return dt.strftime('%Y%m%d') diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c01a7c1d2c240..d912e745788c8 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -303,11 +303,19 @@ def skip_if_no_ne(engine='numexpr'): "%s" % ne.__version__) +def _skip_if_has_locale(): + import locale + lang, _ = locale.getlocale() + if lang is not None: + import nose + raise nose.SkipTest("Specific locale is set {0}".format(lang)) -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # locale utilities -def check_output(*popenargs, **kwargs): # shamelessly taken from Python 2.7 source + +def check_output(*popenargs, **kwargs): + # shamelessly taken from Python 2.7 source r"""Run command with arguments and return its output as a byte string. If the exit code was non-zero it raises a CalledProcessError. The