From 4671aebbc6f79f07c3080f5e6ca49e80c8b36b23 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Thu, 19 Oct 2017 16:52:36 +0200 Subject: [PATCH 01/44] Recognize timezoned labels when accessing dataframes. --- pandas/core/indexes/datetimes.py | 35 +++++++++++++++----------- pandas/tests/indexing/test_datetime.py | 24 ++++++++++++++++++ 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d16251a7829b9..a01d5a6de0d2f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1273,52 +1273,57 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ + if parsed.tzinfo is None: + target_tz = self.tz + else: + target_tz = parsed.tzinfo + if reso == 'year': - return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), + return (Timestamp(datetime(parsed.year, 1, 1), tz=target_tz), Timestamp(datetime(parsed.year, 12, 31, 23, - 59, 59, 999999), tz=self.tz)) + 59, 59, 999999), tz=target_tz)) elif reso == 'month': d = libts.monthrange(parsed.year, parsed.month)[1] return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), + tz=target_tz), Timestamp(datetime(parsed.year, parsed.month, d, 23, - 59, 59, 999999), tz=self.tz)) + 59, 59, 999999), target_tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), + tz=target_tz), Timestamp(datetime(parsed.year, qe, d, 23, 59, - 59, 999999), tz=self.tz)) + 59, 999999), tz=target_tz)) elif reso == 'day': st = datetime(parsed.year, parsed.month, parsed.day) - return (Timestamp(st, tz=self.tz), + return (Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Day(), - tz=self.tz).value - 1)) + tz=target_tz).value - 1)) elif reso == 'hour': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) - return (Timestamp(st, tz=self.tz), + return (Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Hour(), - tz=self.tz).value - 1)) + tz=target_tz).value - 1)) elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) - return (Timestamp(st, tz=self.tz), + return (Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Minute(), - tz=self.tz).value - 1)) + tz=target_tz).value - 1)) elif reso == 'second': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second) - return (Timestamp(st, tz=self.tz), + return (Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Second(), - tz=self.tz).value - 1)) + tz=target_tz).value - 1)) elif reso == 'microsecond': st = datetime(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, parsed.microsecond) - return (Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz)) + return (Timestamp(st, tz=target_tz), Timestamp(st, tz=target_tz)) else: raise KeyError diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 617757c888eb5..e090ccd2595df 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -123,6 +123,30 @@ def test_consistency_with_tz_aware_scalar(self): result = df[0].at[0] assert result == expected + def test_access_datetimeindex_with_timezoned_label(self): + + idx = 
pd.DataFrame(index=pd.date_range('2016-01-01T00:00', '2016-03-31T23:59', freq='T')) + + former_naive_endpoint_idx = idx[ + "2016-01-01T00:00-02:00" + : + "2016-01-01T02:03" + ] + + former_non_naive_endpoint_idx = idx[ + pd.Timestamp("2016-01-01T00:00-02:00") + : + pd.Timestamp("2016-01-01T02:03") + ] + + assert len(former_naive_endpoint_idx) == len(former_non_naive_endpoint_idx) + + assert former_naive_endpoint_idx.iloc[0].name == former_non_naive_endpoint_idx.iloc[0].name + assert former_naive_endpoint_idx.iloc[1].name == former_non_naive_endpoint_idx.iloc[1].name + assert former_naive_endpoint_idx.iloc[2].name == former_non_naive_endpoint_idx.iloc[2].name + assert former_naive_endpoint_idx.iloc[3].name == former_non_naive_endpoint_idx.iloc[3].name + + def test_indexing_with_datetimeindex_tz(self): # GH 12050 From 69b517ef19df32554409626e85361b808fad4190 Mon Sep 17 00:00:00 2001 From: Marvin Kastner Date: Tue, 31 Oct 2017 22:49:27 +0100 Subject: [PATCH 02/44] Make `test_access_datetimeindex_with_timezoned_label` PEP08 compliant. --- pandas/tests/indexing/test_datetime.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index e090ccd2595df..648b33e9ecc3f 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -125,7 +125,10 @@ def test_consistency_with_tz_aware_scalar(self): def test_access_datetimeindex_with_timezoned_label(self): - idx = pd.DataFrame(index=pd.date_range('2016-01-01T00:00', '2016-03-31T23:59', freq='T')) + # 6785, timezone was ignored when simple string was provided as a label + + idx = pd.DataFrame(index=pd.date_range('2016-01-01T00:00', + '2016-03-31T23:59', freq='T')) former_naive_endpoint_idx = idx[ "2016-01-01T00:00-02:00" @@ -139,13 +142,20 @@ def test_access_datetimeindex_with_timezoned_label(self): pd.Timestamp("2016-01-01T02:03") ] - assert len(former_naive_endpoint_idx) == len(former_non_naive_endpoint_idx) + assert (len(former_naive_endpoint_idx) + == len(former_non_naive_endpoint_idx)) + + assert (former_naive_endpoint_idx.iloc[0].name + == former_non_naive_endpoint_idx.iloc[0].name) + + assert (former_naive_endpoint_idx.iloc[1].name + == former_non_naive_endpoint_idx.iloc[1].name) - assert former_naive_endpoint_idx.iloc[0].name == former_non_naive_endpoint_idx.iloc[0].name - assert former_naive_endpoint_idx.iloc[1].name == former_non_naive_endpoint_idx.iloc[1].name - assert former_naive_endpoint_idx.iloc[2].name == former_non_naive_endpoint_idx.iloc[2].name - assert former_naive_endpoint_idx.iloc[3].name == former_non_naive_endpoint_idx.iloc[3].name + assert (former_naive_endpoint_idx.iloc[2].name + == former_non_naive_endpoint_idx.iloc[2].name) + assert (former_naive_endpoint_idx.iloc[3].name + == former_non_naive_endpoint_idx.iloc[3].name) def test_indexing_with_datetimeindex_tz(self): From 6532e76b6bd12254c312b85398bac55fa7616ea1 Mon Sep 17 00:00:00 2001 From: Marvin Kastner Date: Tue, 31 Oct 2017 23:02:36 +0100 Subject: [PATCH 03/44] add translate function for converting time zones. 
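
The helper converts the computed bounds from the label's own timezone into the
timezone of the index before slicing, so both endpoints are compared in the same
frame of reference. A minimal sketch of that conversion step, assuming a label
with an explicit -02:00 offset and an index held in UTC (the values here are
illustrative only):

    import pandas as pd

    # slice endpoint parsed from a timezone-aware label
    lower = pd.Timestamp("2016-01-01T00:00-02:00")

    # express the bound in the index's timezone before it is used for lookup
    print(lower.tz_convert("UTC"))  # 2016-01-01 02:00:00+00:00
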
--- pandas/core/indexes/datetimes.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d4a90eb7521d0..bc4fe0060dce0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1276,52 +1276,64 @@ def _parsed_string_to_bounds(self, reso, parsed): else: target_tz = parsed.tzinfo + def translate(timestamp_lower, timestamp_upper): + if target_tz == self.tz: + return timestamp_lower, timestamp_upper + else: + return ( + timestamp_lower.tz_convert(self.tz), + timestamp_upper.tz_convert(self.tz) + ) + if reso == 'year': - return (Timestamp(datetime(parsed.year, 1, 1), tz=target_tz), + return translate(Timestamp(datetime(parsed.year, 1, 1), tz=target_tz), Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=target_tz)) elif reso == 'month': d = libts.monthrange(parsed.year, parsed.month)[1] - return (Timestamp(datetime(parsed.year, parsed.month, 1), + return translate(Timestamp(datetime(parsed.year, parsed.month, 1), tz=target_tz), Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), target_tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month - return (Timestamp(datetime(parsed.year, parsed.month, 1), + return translate(Timestamp(datetime(parsed.year, parsed.month, 1), tz=target_tz), Timestamp(datetime(parsed.year, qe, d, 23, 59, 59, 999999), tz=target_tz)) elif reso == 'day': st = datetime(parsed.year, parsed.month, parsed.day) - return (Timestamp(st, tz=target_tz), + return translate(Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Day(), tz=target_tz).value - 1)) elif reso == 'hour': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) - return (Timestamp(st, tz=target_tz), + return translate(Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Hour(), tz=target_tz).value - 1)) elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) - return (Timestamp(st, tz=target_tz), + return translate(Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Minute(), tz=target_tz).value - 1)) elif reso == 'second': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second) - return (Timestamp(st, tz=target_tz), + return translate(Timestamp(st, tz=target_tz), Timestamp(Timestamp(st + offsets.Second(), tz=target_tz).value - 1)) elif reso == 'microsecond': st = datetime(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, parsed.microsecond) - return (Timestamp(st, tz=target_tz), Timestamp(st, tz=target_tz)) + return translate( + Timestamp(st, tz=target_tz), + Timestamp(st, tz=target_tz) + ) else: raise KeyError From c354271713842a522c0909701596481593e4ea17 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Oct 2017 17:09:28 -0700 Subject: [PATCH 04/44] Move NaT to self-contained module (#18014) --- pandas/_libs/period.pyx | 5 +- pandas/_libs/src/inference.pyx | 2 +- pandas/_libs/src/ujson/python/objToJSON.c | 11 +- pandas/_libs/tslib.pxd | 1 - pandas/_libs/tslib.pyx | 358 +------------- pandas/_libs/tslibs/nattype.pxd | 6 + pandas/_libs/tslibs/nattype.pyx | 546 ++++++++++++++++++++++ pandas/_libs/tslibs/strptime.pyx | 13 +- pandas/_libs/tslibs/timedeltas.pyx | 7 +- pandas/compat/pickle_compat.py | 8 +- pandas/core/tools/datetimes.py | 2 +- 
pandas/tslib.py | 4 +- setup.py | 2 + 13 files changed, 589 insertions(+), 376 deletions(-) create mode 100644 pandas/_libs/tslibs/nattype.pxd create mode 100644 pandas/_libs/tslibs/nattype.pyx diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 76664e276c634..4b8c86ae9d4b2 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -30,10 +30,11 @@ from pandas._libs import tslib from pandas._libs.tslib import Timestamp, iNaT, NaT from tslibs.timezones cimport ( is_utc, is_tzlocal, get_utcoffset, get_dst_info, maybe_get_tz) -from tslib cimport _nat_scalar_rules from tslibs.parsing import parse_time_string, NAT_SENTINEL from tslibs.frequencies cimport get_freq_code +from tslibs.nattype import nat_strings +from tslibs.nattype cimport _nat_scalar_rules from pandas.tseries import offsets from pandas.tseries import frequencies @@ -1174,7 +1175,7 @@ class Period(_Period): converted = other.asfreq(freq) ordinal = converted.ordinal - elif is_null_datetimelike(value) or value in tslib._nat_strings: + elif is_null_datetimelike(value) or value in nat_strings: ordinal = iNaT elif is_string_object(value) or util.is_integer_object(value): diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 46c4a6db0b67c..8fab825eae428 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -2,7 +2,7 @@ import sys from decimal import Decimal cimport util cimport cython -from tslib import NaT +from tslibs.nattype import NaT from tslib cimport convert_to_tsobject, convert_to_timedelta64 from tslibs.timezones cimport get_timezone from datetime import datetime, timedelta diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index ae7854dfc1427..f799b7f6b4785 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -162,7 +162,7 @@ void initObjToJSON(void) #endif { PyObject *mod_pandas; - PyObject *mod_tslib; + PyObject *mod_nattype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); Py_INCREF(type_decimal); @@ -180,10 +180,11 @@ void initObjToJSON(void) Py_DECREF(mod_pandas); } - mod_tslib = PyImport_ImportModule("pandas._libs.tslib"); - if (mod_tslib) { - cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_tslib, "NaTType"); - Py_DECREF(mod_tslib); + mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (mod_nattype) { + cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype, + "NaTType"); + Py_DECREF(mod_nattype); } /* Initialise numpy API and use 2/3 compatible return */ diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd index 147320b108cc8..5ceff32cfbac7 100644 --- a/pandas/_libs/tslib.pxd +++ b/pandas/_libs/tslib.pxd @@ -2,7 +2,6 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) cpdef convert_to_timedelta64(object, object) -cdef bint _nat_scalar_rules[6] cdef bint _check_all_nulls(obj) cdef _to_i8(object val) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9c3068ab023a2..025533b29366f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -99,6 +99,9 @@ from tslibs.conversion import ( tz_localize_to_utc, tz_convert, tz_convert_single) +from tslibs.nattype import NaT, nat_strings +from tslibs.nattype cimport _checknull_with_nat + cdef inline object create_timestamp_from_ts( int64_t value, pandas_datetimestruct dts, @@ -804,228 +807,7 @@ class 
Timestamp(_Timestamp): return self + other -_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) - - -def _make_nat_func(func_name, cls): - def f(*args, **kwargs): - return NaT - f.__name__ = func_name - f.__doc__ = getattr(cls, func_name).__doc__ - return f - - -def _make_nan_func(func_name, cls): - def f(*args, **kwargs): - return np.nan - f.__name__ = func_name - f.__doc__ = getattr(cls, func_name).__doc__ - return f - - -def _make_error_func(func_name, cls): - def f(*args, **kwargs): - raise ValueError("NaTType does not support " + func_name) - - f.__name__ = func_name - if cls is not None: - f.__doc__ = getattr(cls, func_name).__doc__ - return f - - -class NaTType(_NaT): - """(N)ot-(A)-(T)ime, the time equivalent of NaN""" - - def __new__(cls): - cdef _NaT base - - base = _NaT.__new__(cls, 1, 1, 1) - base.value = NPY_NAT - base.freq = None - - return base - - def __repr__(self): - return 'NaT' - - def __str__(self): - return 'NaT' - - def isoformat(self, sep='T'): - # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. - return 'NaT' - - def __hash__(self): - return NPY_NAT - - def __int__(self): - return NPY_NAT - - def __long__(self): - return NPY_NAT - - def __reduce_ex__(self, protocol): - # python 3.6 compat - # http://bugs.python.org/issue28730 - # now __reduce_ex__ is defined and higher priority than __reduce__ - return self.__reduce__() - - def __reduce__(self): - return (__nat_unpickle, (None, )) - - def total_seconds(self): - """ - Total duration of timedelta in seconds (to ns precision) - """ - # GH 10939 - return np.nan - - @property - def is_leap_year(self): - return False - - @property - def is_month_start(self): - return False - - @property - def is_quarter_start(self): - return False - - @property - def is_year_start(self): - return False - - @property - def is_month_end(self): - return False - - @property - def is_quarter_end(self): - return False - - @property - def is_year_end(self): - return False - - def __rdiv__(self, other): - return _nat_rdivide_op(self, other) - - def __rtruediv__(self, other): - return _nat_rdivide_op(self, other) - - def __rfloordiv__(self, other): - return _nat_rdivide_op(self, other) - - def __rmul__(self, other): - if is_integer_object(other) or is_float_object(other): - return NaT - return NotImplemented - - # ---------------------------------------------------------------------- - # inject the Timestamp field properties - # these by definition return np.nan - - year = property(fget=lambda self: np.nan) - quarter = property(fget=lambda self: np.nan) - month = property(fget=lambda self: np.nan) - day = property(fget=lambda self: np.nan) - hour = property(fget=lambda self: np.nan) - minute = property(fget=lambda self: np.nan) - second = property(fget=lambda self: np.nan) - millisecond = property(fget=lambda self: np.nan) - microsecond = property(fget=lambda self: np.nan) - nanosecond = property(fget=lambda self: np.nan) - - week = property(fget=lambda self: np.nan) - dayofyear = property(fget=lambda self: np.nan) - weekofyear = property(fget=lambda self: np.nan) - days_in_month = property(fget=lambda self: np.nan) - daysinmonth = property(fget=lambda self: np.nan) - dayofweek = property(fget=lambda self: np.nan) - weekday_name = property(fget=lambda self: np.nan) - - # inject Timedelta properties - days = property(fget=lambda self: np.nan) - seconds = property(fget=lambda self: np.nan) - microseconds = property(fget=lambda self: np.nan) - nanoseconds = property(fget=lambda self: np.nan) - - # inject pd.Period 
properties - qyear = property(fget=lambda self: np.nan) - - # ---------------------------------------------------------------------- - # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or - # return NaT create functions that raise, for binding to NaTType - # These are the ones that can get their docstrings from datetime. - - # nan methods - weekday = _make_nan_func('weekday', datetime) - isoweekday = _make_nan_func('isoweekday', datetime) - - # _nat_methods - date = _make_nat_func('date', datetime) - - utctimetuple = _make_error_func('utctimetuple', datetime) - timetz = _make_error_func('timetz', datetime) - timetuple = _make_error_func('timetuple', datetime) - strptime = _make_error_func('strptime', datetime) - strftime = _make_error_func('strftime', datetime) - isocalendar = _make_error_func('isocalendar', datetime) - dst = _make_error_func('dst', datetime) - ctime = _make_error_func('ctime', datetime) - time = _make_error_func('time', datetime) - toordinal = _make_error_func('toordinal', datetime) - tzname = _make_error_func('tzname', datetime) - utcoffset = _make_error_func('utcoffset', datetime) - - # Timestamp has empty docstring for some methods. - utcfromtimestamp = _make_error_func('utcfromtimestamp', None) - fromtimestamp = _make_error_func('fromtimestamp', None) - combine = _make_error_func('combine', None) - utcnow = _make_error_func('utcnow', None) - - timestamp = _make_error_func('timestamp', Timestamp) - - # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or - # return NaT create functions that raise, for binding to NaTType - astimezone = _make_error_func('astimezone', Timestamp) - fromordinal = _make_error_func('fromordinal', Timestamp) - - # _nat_methods - to_pydatetime = _make_nat_func('to_pydatetime', Timestamp) - - now = _make_nat_func('now', Timestamp) - today = _make_nat_func('today', Timestamp) - round = _make_nat_func('round', Timestamp) - floor = _make_nat_func('floor', Timestamp) - ceil = _make_nat_func('ceil', Timestamp) - - tz_convert = _make_nat_func('tz_convert', Timestamp) - tz_localize = _make_nat_func('tz_localize', Timestamp) - replace = _make_nat_func('replace', Timestamp) - - def to_datetime(self): - """ - DEPRECATED: use :meth:`to_pydatetime` instead. - - Convert a Timestamp object to a native Python datetime object. - """ - warnings.warn("to_datetime is deprecated. 
Use self.to_pydatetime()", - FutureWarning, stacklevel=2) - return self.to_pydatetime(warn=False) - - -def __nat_unpickle(*args): - # return constant defined in the module - return NaT - -NaT = NaTType() - -cdef inline bint _checknull_with_nat(object val): - """ utility to check if a value is a nat or not """ - return val is None or ( - PyFloat_Check(val) and val != val) or val is NaT +# ---------------------------------------------------------------------- cdef inline bint _check_all_nulls(object val): """ utility to check if a value is any type of null """ @@ -1044,9 +826,6 @@ cdef inline bint _check_all_nulls(object val): res = 0 return res -cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1: - return _nat_scalar_rules[op] - cpdef object get_value_box(ndarray arr, object loc): cdef: @@ -1163,7 +942,7 @@ cdef class _Timestamp(datetime): if isinstance(other, _Timestamp): ots = other elif other is NaT: - return _cmp_nat_dt(other, self, _reverse_ops[op]) + return op == Py_NE elif PyDateTime_Check(other): if self.nanosecond == 0: val = self.to_pydatetime() @@ -1455,123 +1234,6 @@ cdef inline bint is_timestamp(object o): return Py_TYPE(o) == ts_type # isinstance(o, Timestamp) -cdef bint _nat_scalar_rules[6] - -_nat_scalar_rules[Py_EQ] = False -_nat_scalar_rules[Py_NE] = True -_nat_scalar_rules[Py_LT] = False -_nat_scalar_rules[Py_LE] = False -_nat_scalar_rules[Py_GT] = False -_nat_scalar_rules[Py_GE] = False - - -cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or is_timedelta64_object(other) or other is NaT: - return np.nan - if is_integer_object(other) or is_float_object(other): - return NaT - return NotImplemented - -cdef _nat_rdivide_op(self, other): - if PyDelta_Check(other): - return np.nan - return NotImplemented - - -cdef class _NaT(datetime): - cdef readonly: - int64_t value - object freq - - def __hash__(_NaT self): - # py3k needs this defined here - return hash(self.value) - - def __richcmp__(_NaT self, object other, int op): - cdef int ndim = getattr(other, 'ndim', -1) - - if ndim == -1: - return _nat_scalar_rules[op] - - if ndim == 0: - if is_datetime64_object(other): - return _nat_scalar_rules[op] - else: - raise TypeError('Cannot compare type %r with type %r' % - (type(self).__name__, type(other).__name__)) - return PyObject_RichCompare(other, self, _reverse_ops[op]) - - def __add__(self, other): - if PyDateTime_Check(other): - return NaT - - elif hasattr(other, 'delta'): - # Timedelta, offsets.Tick, offsets.Week - return NaT - elif getattr(other, '_typ', None) in ['dateoffset', 'series', - 'period', 'datetimeindex', - 'timedeltaindex']: - # Duplicate logic in _Timestamp.__add__ to avoid needing - # to subclass; allows us to @final(_Timestamp.__add__) - return NotImplemented - return NaT - - def __sub__(self, other): - # Duplicate some logic from _Timestamp.__sub__ to avoid needing - # to subclass; allows us to @final(_Timestamp.__sub__) - if PyDateTime_Check(other): - return NaT - elif PyDelta_Check(other): - return NaT - - elif getattr(other, '_typ', None) == 'datetimeindex': - # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - return -other.__sub__(self) - - elif getattr(other, '_typ', None) == 'timedeltaindex': - # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - return (-other).__add__(self) - - elif hasattr(other, 'delta'): - # offsets.Tick, offsets.Week - neg_other = -other - return self + neg_other - - elif getattr(other, '_typ', None) in ['period', - 'periodindex', 'dateoffset']: - return NotImplemented - 
- return NaT - - def __pos__(self): - return NaT - - def __neg__(self): - return NaT - - def __div__(self, other): - return _nat_divide_op(self, other) - - def __truediv__(self, other): - return _nat_divide_op(self, other) - - def __floordiv__(self, other): - return _nat_divide_op(self, other) - - def __mul__(self, other): - if is_integer_object(other) or is_float_object(other): - return NaT - return NotImplemented - - @property - def asm8(self): - return np.datetime64(NPY_NAT, 'ns') - - def to_datetime64(self): - """ Returns a numpy.datetime64 object with 'ns' precision """ - return np.datetime64('NaT') - - # helper to extract datetime and int64 from several different possibilities cdef convert_to_tsobject(object ts, object tz, object unit, bint dayfirst, bint yearfirst): @@ -1732,7 +1394,7 @@ cdef convert_str_to_tsobject(object ts, object tz, object unit, assert is_string_object(ts) - if len(ts) == 0 or ts in _nat_strings: + if len(ts) == 0 or ts in nat_strings: ts = NaT elif ts == 'now': # Issue 9000, we short-circuit rather than going @@ -2055,7 +1717,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): iresult[i] = NPY_NAT elif is_string_object(val): - if len(val) == 0 or val in _nat_strings: + if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT else: @@ -2116,7 +1778,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): oresult[i] = val elif is_string_object(val): - if len(val) == 0 or val in _nat_strings: + if len(val) == 0 or val in nat_strings: oresult[i] = NaT else: @@ -2234,7 +1896,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', # string try: - if len(val) == 0 or val in _nat_strings: + if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue @@ -2336,7 +1998,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', oresult[i] = val elif is_string_object(val): - if len(val) == 0 or val in _nat_strings: + if len(val) == 0 or val in nat_strings: oresult[i] = 'NaT' continue diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd new file mode 100644 index 0000000000000..7ded36bb1bdc0 --- /dev/null +++ b/pandas/_libs/tslibs/nattype.pxd @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +cdef bint _nat_scalar_rules[6] + +cdef bint _checknull_with_nat(object val) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx new file mode 100644 index 0000000000000..dedc115501cd0 --- /dev/null +++ b/pandas/_libs/tslibs/nattype.pyx @@ -0,0 +1,546 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +import warnings + +from cpython cimport ( + PyFloat_Check, PyComplex_Check, + PyObject_RichCompare, + Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE) + +from cpython.datetime cimport (datetime, + PyDateTime_Check, PyDelta_Check, + PyDateTime_IMPORT) +PyDateTime_IMPORT + +import numpy as np +cimport numpy as np +from numpy cimport int64_t +np.import_array() + +from util cimport (get_nat, + is_integer_object, is_float_object, + is_datetime64_object, is_timedelta64_object) + +# ---------------------------------------------------------------------- +# Constants +nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + +cdef int64_t NPY_NAT = get_nat() + +cdef bint _nat_scalar_rules[6] +_nat_scalar_rules[Py_EQ] = False +_nat_scalar_rules[Py_NE] = True +_nat_scalar_rules[Py_LT] = False +_nat_scalar_rules[Py_LE] = False +_nat_scalar_rules[Py_GT] = False +_nat_scalar_rules[Py_GE] = False + +# 
---------------------------------------------------------------------- + + +def _make_nan_func(func_name, cls): + def f(*args, **kwargs): + return np.nan + f.__name__ = func_name + f.__doc__ = getattr(cls, func_name).__doc__ + return f + + +def _make_nat_func(func_name, cls): + def f(*args, **kwargs): + return NaT + + f.__name__ = func_name + if isinstance(cls, str): + # passed the literal docstring directly + f.__doc__ = cls + else: + f.__doc__ = getattr(cls, func_name).__doc__ + return f + + +def _make_error_func(func_name, cls): + def f(*args, **kwargs): + raise ValueError("NaTType does not support " + func_name) + + f.__name__ = func_name + if isinstance(cls, str): + # passed the literal docstring directly + f.__doc__ = cls + elif cls is not None: + f.__doc__ = getattr(cls, func_name).__doc__ + return f + + +cdef _nat_divide_op(self, other): + if PyDelta_Check(other) or is_timedelta64_object(other) or other is NaT: + return np.nan + if is_integer_object(other) or is_float_object(other): + return NaT + return NotImplemented + + +cdef _nat_rdivide_op(self, other): + if PyDelta_Check(other): + return np.nan + return NotImplemented + + +def __nat_unpickle(*args): + # return constant defined in the module + return NaT + +# ---------------------------------------------------------------------- + + +cdef class _NaT(datetime): + cdef readonly: + int64_t value + object freq + + def __hash__(_NaT self): + # py3k needs this defined here + return hash(self.value) + + def __richcmp__(_NaT self, object other, int op): + cdef int ndim = getattr(other, 'ndim', -1) + + if ndim == -1: + return _nat_scalar_rules[op] + + if ndim == 0: + if is_datetime64_object(other): + return _nat_scalar_rules[op] + else: + raise TypeError('Cannot compare type %r with type %r' % + (type(self).__name__, type(other).__name__)) + # Note: instead of passing "other, self, _reverse_ops[op]", we observe + # that `_nat_scalar_rules` is invariant under `_reverse_ops`, + # rendering it unnecessary. 
+ return PyObject_RichCompare(other, self, op) + + def __add__(self, other): + if PyDateTime_Check(other): + return NaT + + elif hasattr(other, 'delta'): + # Timedelta, offsets.Tick, offsets.Week + return NaT + elif getattr(other, '_typ', None) in ['dateoffset', 'series', + 'period', 'datetimeindex', + 'timedeltaindex']: + # Duplicate logic in _Timestamp.__add__ to avoid needing + # to subclass; allows us to @final(_Timestamp.__add__) + return NotImplemented + return NaT + + def __sub__(self, other): + # Duplicate some logic from _Timestamp.__sub__ to avoid needing + # to subclass; allows us to @final(_Timestamp.__sub__) + if PyDateTime_Check(other): + return NaT + elif PyDelta_Check(other): + return NaT + + elif getattr(other, '_typ', None) == 'datetimeindex': + # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex + return -other.__sub__(self) + + elif getattr(other, '_typ', None) == 'timedeltaindex': + # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex + return (-other).__add__(self) + + elif hasattr(other, 'delta'): + # offsets.Tick, offsets.Week + neg_other = -other + return self + neg_other + + elif getattr(other, '_typ', None) in ['period', + 'periodindex', 'dateoffset']: + return NotImplemented + + return NaT + + def __pos__(self): + return NaT + + def __neg__(self): + return NaT + + def __div__(self, other): + return _nat_divide_op(self, other) + + def __truediv__(self, other): + return _nat_divide_op(self, other) + + def __floordiv__(self, other): + return _nat_divide_op(self, other) + + def __mul__(self, other): + if is_integer_object(other) or is_float_object(other): + return NaT + return NotImplemented + + @property + def asm8(self): + return np.datetime64(NPY_NAT, 'ns') + + def to_datetime64(self): + """ Returns a numpy.datetime64 object with 'ns' precision """ + return np.datetime64('NaT') + + +class NaTType(_NaT): + """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + + def __new__(cls): + cdef _NaT base + + base = _NaT.__new__(cls, 1, 1, 1) + base.value = NPY_NAT + base.freq = None + + return base + + def __repr__(self): + return 'NaT' + + def __str__(self): + return 'NaT' + + def isoformat(self, sep='T'): + # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. 
+ return 'NaT' + + def __hash__(self): + return NPY_NAT + + def __int__(self): + return NPY_NAT + + def __long__(self): + return NPY_NAT + + def __reduce_ex__(self, protocol): + # python 3.6 compat + # http://bugs.python.org/issue28730 + # now __reduce_ex__ is defined and higher priority than __reduce__ + return self.__reduce__() + + def __reduce__(self): + return (__nat_unpickle, (None, )) + + def total_seconds(self): + """ + Total duration of timedelta in seconds (to ns precision) + """ + # GH 10939 + return np.nan + + @property + def is_leap_year(self): + return False + + @property + def is_month_start(self): + return False + + @property + def is_quarter_start(self): + return False + + @property + def is_year_start(self): + return False + + @property + def is_month_end(self): + return False + + @property + def is_quarter_end(self): + return False + + @property + def is_year_end(self): + return False + + def __rdiv__(self, other): + return _nat_rdivide_op(self, other) + + def __rtruediv__(self, other): + return _nat_rdivide_op(self, other) + + def __rfloordiv__(self, other): + return _nat_rdivide_op(self, other) + + def __rmul__(self, other): + if is_integer_object(other) or is_float_object(other): + return NaT + return NotImplemented + + # ---------------------------------------------------------------------- + # inject the Timestamp field properties + # these by definition return np.nan + + year = property(fget=lambda self: np.nan) + quarter = property(fget=lambda self: np.nan) + month = property(fget=lambda self: np.nan) + day = property(fget=lambda self: np.nan) + hour = property(fget=lambda self: np.nan) + minute = property(fget=lambda self: np.nan) + second = property(fget=lambda self: np.nan) + millisecond = property(fget=lambda self: np.nan) + microsecond = property(fget=lambda self: np.nan) + nanosecond = property(fget=lambda self: np.nan) + + week = property(fget=lambda self: np.nan) + dayofyear = property(fget=lambda self: np.nan) + weekofyear = property(fget=lambda self: np.nan) + days_in_month = property(fget=lambda self: np.nan) + daysinmonth = property(fget=lambda self: np.nan) + dayofweek = property(fget=lambda self: np.nan) + weekday_name = property(fget=lambda self: np.nan) + + # inject Timedelta properties + days = property(fget=lambda self: np.nan) + seconds = property(fget=lambda self: np.nan) + microseconds = property(fget=lambda self: np.nan) + nanoseconds = property(fget=lambda self: np.nan) + + # inject pd.Period properties + qyear = property(fget=lambda self: np.nan) + + # ---------------------------------------------------------------------- + # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or + # return NaT create functions that raise, for binding to NaTType + # These are the ones that can get their docstrings from datetime. 
+ + # nan methods + weekday = _make_nan_func('weekday', datetime) + isoweekday = _make_nan_func('isoweekday', datetime) + + # _nat_methods + date = _make_nat_func('date', datetime) + + utctimetuple = _make_error_func('utctimetuple', datetime) + timetz = _make_error_func('timetz', datetime) + timetuple = _make_error_func('timetuple', datetime) + strptime = _make_error_func('strptime', datetime) + strftime = _make_error_func('strftime', datetime) + isocalendar = _make_error_func('isocalendar', datetime) + dst = _make_error_func('dst', datetime) + ctime = _make_error_func('ctime', datetime) + time = _make_error_func('time', datetime) + toordinal = _make_error_func('toordinal', datetime) + tzname = _make_error_func('tzname', datetime) + utcoffset = _make_error_func('utcoffset', datetime) + + # Timestamp has empty docstring for some methods. + utcfromtimestamp = _make_error_func('utcfromtimestamp', None) + fromtimestamp = _make_error_func('fromtimestamp', None) + combine = _make_error_func('combine', None) + utcnow = _make_error_func('utcnow', None) + + # ---------------------------------------------------------------------- + # The remaining methods have docstrings copy/pasted from the analogous + # Timestamp methods. + + timestamp = _make_error_func('timestamp', # noqa:E128 + """Return POSIX timestamp as float.""") + + # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or + # return NaT create functions that raise, for binding to NaTType + astimezone = _make_error_func('astimezone', # noqa:E128 + """ + Convert tz-aware Timestamp to another time zone. + + Parameters + ---------- + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will be converted to. + None will remove timezone holding UTC time. + + Returns + ------- + converted : Timestamp + + Raises + ------ + TypeError + If Timestamp is tz-naive. + """) + fromordinal = _make_error_func('fromordinal', # noqa:E128 + """ + passed an ordinal, translate and convert to a ts + note: by definition there cannot be any tz info on the ordinal itself + + Parameters + ---------- + ordinal : int + date corresponding to a proleptic Gregorian ordinal + freq : str, DateOffset + Offset which Timestamp will have + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will have. + offset : str, DateOffset + Deprecated, use freq + """) + + # _nat_methods + to_pydatetime = _make_nat_func('to_pydatetime', # noqa:E128 + """ + Convert a Timestamp object to a native Python datetime object. + + If warn=True, issue a warning if nanoseconds is nonzero. + """) + + now = _make_nat_func('now', # noqa:E128 + """ + Return the current time in the local timezone. Equivalent + to datetime.now([tz]) + + Parameters + ---------- + tz : string / timezone object, default None + Timezone to localize to + """) + today = _make_nat_func('today', # noqa:E128 + """ + Return the current time in the local timezone. This differs + from datetime.today() in that it can be localized to a + passed timezone. 
+ + Parameters + ---------- + tz : string / timezone object, default None + Timezone to localize to + """) + round = _make_nat_func('round', # noqa:E128 + """ + Round the Timestamp to the specified resolution + + Returns + ------- + a new Timestamp rounded to the given resolution of `freq` + + Parameters + ---------- + freq : a freq string indicating the rounding resolution + + Raises + ------ + ValueError if the freq cannot be converted + """) + floor = _make_nat_func('floor', # noqa:E128 + """ + return a new Timestamp floored to this resolution + + Parameters + ---------- + freq : a freq string indicating the flooring resolution + """) + ceil = _make_nat_func('ceil', # noqa:E128 + """ + return a new Timestamp ceiled to this resolution + + Parameters + ---------- + freq : a freq string indicating the ceiling resolution + """) + + tz_convert = _make_nat_func('tz_convert', # noqa:E128 + """ + Convert tz-aware Timestamp to another time zone. + + Parameters + ---------- + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will be converted to. + None will remove timezone holding UTC time. + + Returns + ------- + converted : Timestamp + + Raises + ------ + TypeError + If Timestamp is tz-naive. + """) + tz_localize = _make_nat_func('tz_localize', # noqa:E128 + """ + Convert naive Timestamp to local time zone, or remove + timezone from tz-aware Timestamp. + + Parameters + ---------- + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will be converted to. + None will remove timezone holding local time. + ambiguous : bool, 'NaT', default 'raise' + - bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates) + - 'NaT' will return NaT for an ambiguous time + - 'raise' will raise an AmbiguousTimeError for an ambiguous time + errors : 'raise', 'coerce', default 'raise' + - 'raise' will raise a NonExistentTimeError if a timestamp is not + valid in the specified timezone (e.g. due to a transition from + or to DST time) + - 'coerce' will return NaT if the timestamp can not be converted + into the specified timezone + + .. versionadded:: 0.19.0 + + Returns + ------- + localized : Timestamp + + Raises + ------ + TypeError + If the Timestamp is tz-aware and tz is not None. + """) + replace = _make_nat_func('replace', # noqa:E128 + """ + implements datetime.replace, handles nanoseconds + + Parameters + ---------- + year : int, optional + month : int, optional + day : int, optional + hour : int, optional + minute : int, optional + second : int, optional + microsecond : int, optional + nanosecond: int, optional + tzinfo : tz-convertible, optional + fold : int, optional, default is 0 + added in 3.6, NotImplemented + + Returns + ------- + Timestamp with fields replaced + """) + + def to_datetime(self): + """ + DEPRECATED: use :meth:`to_pydatetime` instead. + + Convert a Timestamp object to a native Python datetime object. + """ + warnings.warn("to_datetime is deprecated. 
Use self.to_pydatetime()", + FutureWarning, stacklevel=2) + return self.to_pydatetime(warn=False) + + +NaT = NaTType() + + +# ---------------------------------------------------------------------- + +cdef inline bint _checknull_with_nat(object val): + """ utility to check if a value is a nat or not """ + return val is None or ( + PyFloat_Check(val) and val != val) or val is NaT diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 673d45e9c15cb..a38aa37674e9e 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -40,15 +40,8 @@ from util cimport is_string_object, get_nat cdef int64_t NPY_NAT = get_nat() -cdef set _nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) - - -# TODO: Consolidate with other implementations -cdef inline bint _checknull_with_nat(object val): - """ utility to check if a value is a nat or not """ - return (val is None or - (PyFloat_Check(val) and val != val) or - (isinstance(val, datetime) and not val == val)) +from nattype cimport _checknull_with_nat +from nattype import nat_strings def array_strptime(ndarray[object] values, object fmt, @@ -146,7 +139,7 @@ def array_strptime(ndarray[object] values, object fmt, for i in range(n): val = values[i] if is_string_object(val): - if val in _nat_strings: + if val in nat_strings: iresult[i] = NPY_NAT continue else: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 1785c85da4949..da1163e25f5c6 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -9,12 +9,11 @@ from numpy cimport int64_t cimport util +from nattype import nat_strings + # ---------------------------------------------------------------------- # Constants -# TODO: Get this from tslibs.nattype once available -_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) - cdef int64_t NPY_NAT = util.get_nat() cdef dict timedelta_abbrevs = { 'D': 'd', @@ -115,7 +114,7 @@ cdef inline parse_timedelta_string(object ts): # have_value : track if we have at least 1 leading unit # have_hhmmss : tracks if we have a regular format hh:mm:ss - if len(ts) == 0 or ts in _nat_strings: + if len(ts) == 0 or ts in nat_strings: return NPY_NAT # decode ts if necessary diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index f6223c48994ae..8015642919611 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -74,10 +74,14 @@ def load_reduce(self): ('pandas._libs.sparse', 'BlockIndex'), ('pandas.tslib', 'Timestamp'): ('pandas._libs.tslib', 'Timestamp'), - ('pandas.tslib', '__nat_unpickle'): - ('pandas._libs.tslib', '__nat_unpickle'), ('pandas._period', 'Period'): ('pandas._libs.period', 'Period'), + # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype + ('pandas.tslib', '__nat_unpickle'): + ('pandas._libs.tslibs.nattype', '__nat_unpickle'), + ('pandas._libs.tslib', '__nat_unpickle'): + ('pandas._libs.tslibs.nattype', '__nat_unpickle'), + # 15998 top-level dirs moving ('pandas.sparse.array', 'SparseArray'): ('pandas.core.sparse.array', 'SparseArray'), diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e335dfe3a4142..ae8aa275b2bae 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -535,7 +535,7 @@ def calc_with_mask(carg, mask): # string with NaN-like try: - mask = ~algorithms.isin(arg, list(tslib._nat_strings)) + mask = ~algorithms.isin(arg, list(tslib.nat_strings)) return calc_with_mask(arg, mask) 
except: pass diff --git a/pandas/tslib.py b/pandas/tslib.py index c960a4eaf59ad..c06b34c1b0483 100644 --- a/pandas/tslib.py +++ b/pandas/tslib.py @@ -3,5 +3,5 @@ import warnings warnings.warn("The pandas.tslib module is deprecated and will be " "removed in a future version.", FutureWarning, stacklevel=2) -from pandas._libs.tslib import (Timestamp, Timedelta, - NaT, NaTType, OutOfBoundsDatetime) +from pandas._libs.tslib import Timestamp, Timedelta, OutOfBoundsDatetime +from pandas._libs.tslibs.nattype import NaT, NaTType diff --git a/setup.py b/setup.py index e60ba18ae34d9..ed58329d5fd8f 100755 --- a/setup.py +++ b/setup.py @@ -516,6 +516,8 @@ def pxd(name): 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', 'pxdfiles': ['_libs/src/util']}, + '_libs.tslibs.nattype': {'pyxfile': '_libs/tslibs/nattype', + 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index', 'sources': np_datetime_sources, 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], From a9202fb13a660a43ceb12d891ab4b82ef0ce5670 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Oct 2017 18:19:38 -0700 Subject: [PATCH 05/44] Separate out arithmetic tests for datetimelike indexes (#18049) --- .../indexes/datetimes/test_arithmetic.py | 335 ++++++++ .../tests/indexes/datetimes/test_datetime.py | 51 -- pandas/tests/indexes/datetimes/test_ops.py | 334 +------- .../tests/indexes/period/test_arithmetic.py | 435 ++++++++++ pandas/tests/indexes/period/test_ops.py | 406 +-------- .../indexes/timedeltas/test_arithmetic.py | 770 ++++++++++++++++++ .../tests/indexes/timedeltas/test_astype.py | 42 +- pandas/tests/indexes/timedeltas/test_ops.py | 639 +-------------- .../indexes/timedeltas/test_timedelta.py | 113 +-- pandas/tests/indexes/timedeltas/test_tools.py | 28 +- pandas/tests/scalar/test_timedelta.py | 29 + 11 files changed, 1625 insertions(+), 1557 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_arithmetic.py create mode 100644 pandas/tests/indexes/period/test_arithmetic.py create mode 100644 pandas/tests/indexes/timedeltas/test_arithmetic.py diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py new file mode 100644 index 0000000000000..2f3d567599fa6 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -0,0 +1,335 @@ +# -*- coding: utf-8 -*- +import warnings +from datetime import datetime, timedelta + +import pytest + +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas.errors import PerformanceWarning +from pandas import (Timestamp, Timedelta, Series, + DatetimeIndex, TimedeltaIndex, + date_range) + + +class TestDatetimeIndexArithmetic(object): + tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', + 'dateutil/US/Pacific'] + + def test_add_iadd(self): + for tz in self.tz: + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + result = rng + delta + expected = pd.date_range('2000-01-01 02:00', + '2000-02-01 02:00', tz=tz) + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, + tz=tz) + result = rng + 1 + expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, 
expected) + + idx = DatetimeIndex(['2011-01-01', '2011-01-02']) + msg = "cannot add DatetimeIndex and Timestamp" + with tm.assert_raises_regex(TypeError, msg): + idx + Timestamp('2011-01-01') + + with tm.assert_raises_regex(TypeError, msg): + Timestamp('2011-01-01') + idx + + def test_sub_isub(self): + for tz in self.tz: + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + expected = pd.date_range('1999-12-31 22:00', + '2000-01-31 22:00', tz=tz) + + result = rng - delta + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, + tz=tz) + result = rng - 1 + expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + @pytest.mark.parametrize('addend', [ + datetime(2011, 1, 1), + DatetimeIndex(['2011-01-01', '2011-01-02']), + DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize('US/Eastern'), + np.datetime64('2011-01-01'), + Timestamp('2011-01-01')]) + def test_add_datetimelike_and_dti(self, addend): + # GH#9631 + dti = DatetimeIndex(['2011-01-01', '2011-01-02']) + msg = 'cannot add DatetimeIndex and {0}'.format( + type(addend).__name__) + with tm.assert_raises_regex(TypeError, msg): + dti + addend + with tm.assert_raises_regex(TypeError, msg): + addend + dti + + @pytest.mark.parametrize('addend', [ + datetime(2011, 1, 1), + DatetimeIndex(['2011-01-01', '2011-01-02']), + DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize('US/Eastern'), + np.datetime64('2011-01-01'), + Timestamp('2011-01-01')]) + def test_add_datetimelike_and_dti_tz(self, addend): + # GH#9631 + dti_tz = DatetimeIndex(['2011-01-01', + '2011-01-02']).tz_localize('US/Eastern') + msg = 'cannot add DatetimeIndex and {0}'.format( + type(addend).__name__) + with tm.assert_raises_regex(TypeError, msg): + dti_tz + addend + with tm.assert_raises_regex(TypeError, msg): + addend + dti_tz + + def test_sub_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now changed to + # return subtraction -> TimeDeltaIndex (GH ...) 
+ + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + expected = TimedeltaIndex([0, 0, 0]) + + result = dti - dti + tm.assert_index_equal(result, expected) + + result = dti_tz - dti_tz + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + dti_tz - dti + + with pytest.raises(TypeError): + dti - dti_tz + + with pytest.raises(TypeError): + dti_tz - dti_tz2 + + # isub + dti -= dti + tm.assert_index_equal(dti, expected) + + # different length raises ValueError + dti1 = date_range('20130101', periods=3) + dti2 = date_range('20130101', periods=4) + with pytest.raises(ValueError): + dti1 - dti2 + + # NaN propagation + dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) + dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) + expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + result = dti2 - dti1 + tm.assert_index_equal(result, expected) + + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'D']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) + + with pytest.raises(TypeError): + idx - p + + with pytest.raises(TypeError): + p - idx + + def test_ufunc_coercions(self): + idx = date_range('2011-01-01', periods=3, freq='2D', name='x') + + delta = np.timedelta64(1, 'D') + for result in [idx + delta, np.add(idx, delta)]: + assert isinstance(result, DatetimeIndex) + exp = date_range('2011-01-02', periods=3, freq='2D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '2D' + + for result in [idx - delta, np.subtract(idx, delta)]: + assert isinstance(result, DatetimeIndex) + exp = date_range('2010-12-31', periods=3, freq='2D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '2D' + + delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), + np.timedelta64(3, 'D')]) + for result in [idx + delta, np.add(idx, delta)]: + assert isinstance(result, DatetimeIndex) + exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], + freq='3D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '3D' + + for result in [idx - delta, np.subtract(idx, delta)]: + assert isinstance(result, DatetimeIndex) + exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], + freq='D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == 'D' + + def test_overflow_offset(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + # ends up multiplying really large numbers which overflow + + t = Timestamp('2017-01-13 00:00:00', freq='D') + offset = 20169940 * pd.offsets.Day(1) + + def f(): + t + offset + pytest.raises(OverflowError, f) + + def f(): + offset + t + pytest.raises(OverflowError, f) + + def f(): + t - offset + pytest.raises(OverflowError, f) + + +# GH 10699 +@pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], + [tm.assert_series_equal, + tm.assert_index_equal])) +def test_datetime64_with_DateOffset(klass, assert_func): + s = klass(date_range('2000-01-01', '2000-01-31'), name='a') + result = s + pd.DateOffset(years=1) + result2 = pd.DateOffset(years=1) + s + exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') + assert_func(result, exp) + assert_func(result2, exp) + + result = s - pd.DateOffset(years=1) + exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') + assert_func(result, exp) + + s = 
klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), + Timestamp('2000-02-16', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + pd.offsets.MonthEnd() + result2 = pd.offsets.MonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + # array of offsets - valid for Series only + if klass is Series: + with tm.assert_produces_warning(PerformanceWarning): + s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.MonthEnd()]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') + ]) + assert_func(result, exp) + + # same offset + result = s + Series([pd.offsets.DateOffset(years=1), + pd.offsets.DateOffset(years=1)]) + exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) + assert_func(result, exp) + + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) + + # DateOffset relativedelta fastpath + relative_kwargs = [('years', 2), ('months', 5), ('days', 3), + ('hours', 5), ('minutes', 10), ('seconds', 2), + ('microseconds', 5)] + for i, kwd in enumerate(relative_kwargs): + op = pd.DateOffset(**dict([kwd])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + + # assert these are equal on a piecewise basis + offsets = ['YearBegin', ('YearBegin', {'month': 5}), + 'YearEnd', ('YearEnd', {'month': 5}), + 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', + 'Week', ('Week', {'weekday': 3}), + 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', + 'CustomBusinessDay', 'CDay', 'CBMonthEnd', + 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', + 'BusinessHour', 'BYearBegin', 'BYearEnd', + 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), + ('FY5253Quarter', {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 2, + 'variation': 'nearest'}), + ('FY5253', {'weekday': 0, + 'startingMonth': 2, + 'variation': + 'nearest'}), + ('WeekOfMonth', {'weekday': 2, + 'week': 2}), + 'Easter', ('DateOffset', {'day': 4}), + ('DateOffset', {'month': 5})] + + with warnings.catch_warnings(record=True): + for normalize in (True, False): + for do in offsets: + if isinstance(do, tuple): + do, kwargs = do + else: + do = do + kwargs = {} + + for n in [0, 5]: + if (do in ['WeekOfMonth', 'LastWeekOfMonth', + 'FY5253Quarter', 'FY5253'] and n == 0): + continue + op = getattr(pd.offsets, do)(n, + normalize=normalize, + **kwargs) + assert_func(klass([x + op for x in s]), s + op) + assert_func(klass([x - op for x in s]), s - op) + assert_func(klass([op + x for x in s]), op + s) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 8d9ac59cf9883..88bf8a4024112 100644 --- 
a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -179,38 +179,6 @@ def test_time_overflow_for_32bit_machines(self): def test_nat(self): assert DatetimeIndex([np.nan])[0] is pd.NaT - def test_ufunc_coercions(self): - idx = date_range('2011-01-01', periods=3, freq='2D', name='x') - - delta = np.timedelta64(1, 'D') - for result in [idx + delta, np.add(idx, delta)]: - assert isinstance(result, DatetimeIndex) - exp = date_range('2011-01-02', periods=3, freq='2D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '2D' - - for result in [idx - delta, np.subtract(idx, delta)]: - assert isinstance(result, DatetimeIndex) - exp = date_range('2010-12-31', periods=3, freq='2D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '2D' - - delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), - np.timedelta64(3, 'D')]) - for result in [idx + delta, np.add(idx, delta)]: - assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], - freq='3D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '3D' - - for result in [idx - delta, np.subtract(idx, delta)]: - assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], - freq='D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == 'D' - def test_week_of_month_frequency(self): # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise d1 = date(2002, 9, 1) @@ -428,25 +396,6 @@ def test_string_index_series_name_converted(self): result = df.T['1/3/2000'] assert result.name == df.index[2] - def test_overflow_offset(self): - # xref https://github.com/statsmodels/statsmodels/issues/3374 - # ends up multiplying really large numbers which overflow - - t = Timestamp('2017-01-13 00:00:00', freq='D') - offset = 20169940 * pd.offsets.Day(1) - - def f(): - t + offset - pytest.raises(OverflowError, f) - - def f(): - offset + t - pytest.raises(OverflowError, f) - - def f(): - t - offset - pytest.raises(OverflowError, f) - def test_get_duplicates(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-03', '2000-01-03', '2000-01-04']) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index b65d467dbd4b8..14217ae291a4c 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -3,15 +3,14 @@ import dateutil import warnings import numpy as np -from datetime import timedelta, datetime +from datetime import datetime from itertools import product import pandas as pd import pandas._libs.tslib as tslib import pandas.util.testing as tm -from pandas.errors import PerformanceWarning -from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, - date_range, TimedeltaIndex, _np_version_under1p10, Index, +from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, + date_range, _np_version_under1p10, Index, bdate_range) from pandas.tseries.offsets import BMonthEnd, CDay, BDay from pandas.tests.test_base import Ops @@ -303,31 +302,31 @@ def test_representation_to_series(self): exp1 = """Series([], dtype: datetime64[ns])""" - exp2 = """0 2011-01-01 -dtype: datetime64[ns]""" + exp2 = ("0 2011-01-01\n" + "dtype: datetime64[ns]") - exp3 = """0 2011-01-01 -1 2011-01-02 -dtype: datetime64[ns]""" + exp3 = ("0 2011-01-01\n" + "1 2011-01-02\n" + "dtype: datetime64[ns]") - exp4 = """0 
2011-01-01 -1 2011-01-02 -2 2011-01-03 -dtype: datetime64[ns]""" + exp4 = ("0 2011-01-01\n" + "1 2011-01-02\n" + "2 2011-01-03\n" + "dtype: datetime64[ns]") - exp5 = """0 2011-01-01 09:00:00+09:00 -1 2011-01-01 10:00:00+09:00 -2 2011-01-01 11:00:00+09:00 -dtype: datetime64[ns, Asia/Tokyo]""" + exp5 = ("0 2011-01-01 09:00:00+09:00\n" + "1 2011-01-01 10:00:00+09:00\n" + "2 2011-01-01 11:00:00+09:00\n" + "dtype: datetime64[ns, Asia/Tokyo]") - exp6 = """0 2011-01-01 09:00:00-05:00 -1 2011-01-01 10:00:00-05:00 -2 NaT -dtype: datetime64[ns, US/Eastern]""" + exp6 = ("0 2011-01-01 09:00:00-05:00\n" + "1 2011-01-01 10:00:00-05:00\n" + "2 NaT\n" + "dtype: datetime64[ns, US/Eastern]") - exp7 = """0 2011-01-01 09:00:00 -1 2011-01-02 10:15:00 -dtype: datetime64[ns]""" + exp7 = ("0 2011-01-01 09:00:00\n" + "1 2011-01-02 10:15:00\n" + "dtype: datetime64[ns]") with pd.option_context('display.width', 300): for idx, expected in zip([idx1, idx2, idx3, idx4, @@ -350,17 +349,17 @@ def test_summary(self): idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern') - exp1 = """DatetimeIndex: 0 entries -Freq: D""" + exp1 = ("DatetimeIndex: 0 entries\n" + "Freq: D") - exp2 = """DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01 -Freq: D""" + exp2 = ("DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\n" + "Freq: D") - exp3 = """DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02 -Freq: D""" + exp3 = ("DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\n" + "Freq: D") - exp4 = """DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03 -Freq: D""" + exp4 = ("DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\n" + "Freq: D") exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " "to 2011-01-01 11:00:00+09:00\n" @@ -406,79 +405,6 @@ def test_union(self): result_union = rng.union(other) tm.assert_index_equal(result_union, expected) - def test_add_iadd(self): - for tz in self.tz: - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng + delta - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, - tz=tz) - result = rng + 1 - expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - idx = DatetimeIndex(['2011-01-01', '2011-01-02']) - msg = "cannot add DatetimeIndex and Timestamp" - with tm.assert_raises_regex(TypeError, msg): - idx + Timestamp('2011-01-01') - - with tm.assert_raises_regex(TypeError, msg): - Timestamp('2011-01-01') + idx - - @pytest.mark.parametrize('addend', [ - datetime(2011, 1, 1), - DatetimeIndex(['2011-01-01', '2011-01-02']), - DatetimeIndex(['2011-01-01', '2011-01-02']) - .tz_localize('US/Eastern'), - np.datetime64('2011-01-01'), - Timestamp('2011-01-01'), - ]) - def test_add_datetimelike_and_dti(self, addend): - # issue #9631 - - dti = DatetimeIndex(['2011-01-01', '2011-01-02']) - msg = 'cannot add DatetimeIndex and {0}'.format( - type(addend).__name__) - with tm.assert_raises_regex(TypeError, msg): - dti + addend - with tm.assert_raises_regex(TypeError, msg): - addend + dti - - @pytest.mark.parametrize('addend', [ - datetime(2011, 1, 1), - DatetimeIndex(['2011-01-01', '2011-01-02']), - DatetimeIndex(['2011-01-01', 
'2011-01-02']) - .tz_localize('US/Eastern'), - np.datetime64('2011-01-01'), - Timestamp('2011-01-01'), - ]) - def test_add_datetimelike_and_dti_tz(self, addend): - # issue #9631 - - dti_tz = DatetimeIndex(['2011-01-01', '2011-01-02']) \ - .tz_localize('US/Eastern') - msg = 'cannot add DatetimeIndex and {0}'.format( - type(addend).__name__) - with tm.assert_raises_regex(TypeError, msg): - dti_tz + addend - with tm.assert_raises_regex(TypeError, msg): - addend + dti_tz - def test_difference(self): for tz in self.tz: # diff @@ -500,88 +426,6 @@ def test_difference(self): result_diff = rng.difference(other) tm.assert_index_equal(result_diff, expected) - def test_sub_isub(self): - for tz in self.tz: - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) - - result = rng - delta - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, - tz=tz) - result = rng - 1 - expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - def test_sub_dti_dti(self): - # previously performed setop (deprecated in 0.16.0), now changed to - # return subtraction -> TimeDeltaIndex (GH ...) - - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') - expected = TimedeltaIndex([0, 0, 0]) - - result = dti - dti - tm.assert_index_equal(result, expected) - - result = dti_tz - dti_tz - tm.assert_index_equal(result, expected) - - with pytest.raises(TypeError): - dti_tz - dti - - with pytest.raises(TypeError): - dti - dti_tz - - with pytest.raises(TypeError): - dti_tz - dti_tz2 - - # isub - dti -= dti - tm.assert_index_equal(dti, expected) - - # different length raises ValueError - dti1 = date_range('20130101', periods=3) - dti2 = date_range('20130101', periods=4) - with pytest.raises(ValueError): - dti1 - dti2 - - # NaN propagation - dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) - dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) - expected = TimedeltaIndex(['1 days', np.nan, np.nan]) - result = dti2 - dti1 - tm.assert_index_equal(result, expected) - - def test_sub_period(self): - # GH 13078 - # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - - for freq in [None, 'D']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) - - with pytest.raises(TypeError): - idx - p - - with pytest.raises(TypeError): - p - idx - def test_comp_nat(self): left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) @@ -952,122 +796,6 @@ def test_equals(self): assert not idx.equals(pd.Series(idx3)) -# GH 10699 -@pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], - [tm.assert_series_equal, - tm.assert_index_equal])) -def test_datetime64_with_DateOffset(klass, assert_func): - s = klass(date_range('2000-01-01', '2000-01-31'), name='a') - result = s + pd.DateOffset(years=1) - result2 = pd.DateOffset(years=1) + s - exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') - assert_func(result, exp) - assert_func(result2, exp) - - result = s - pd.DateOffset(years=1) - exp = 
klass(date_range('1999-01-01', '1999-01-31'), name='a') - assert_func(result, exp) - - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.Day() - result2 = pd.offsets.Day() + s - exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')], name='a') - result = s + pd.offsets.MonthEnd() - result2 = pd.offsets.MonthEnd() + s - exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], name='a') - assert_func(result, exp) - assert_func(result2, exp) - - # array of offsets - valid for Series only - if klass is Series: - with tm.assert_produces_warning(PerformanceWarning): - s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.MonthEnd()]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29') - ]) - assert_func(result, exp) - - # same offset - result = s + Series([pd.offsets.DateOffset(years=1), - pd.offsets.DateOffset(years=1)]) - exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) - assert_func(result, exp) - - s = klass([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-03-31'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) - - # DateOffset relativedelta fastpath - relative_kwargs = [('years', 2), ('months', 5), ('days', 3), - ('hours', 5), ('minutes', 10), ('seconds', 2), - ('microseconds', 5)] - for i, kwd in enumerate(relative_kwargs): - op = pd.DateOffset(**dict([kwd])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - - # assert these are equal on a piecewise basis - offsets = ['YearBegin', ('YearBegin', {'month': 5}), - 'YearEnd', ('YearEnd', {'month': 5}), - 'MonthBegin', 'MonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'Week', ('Week', {'weekday': 3}), - 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', - 'CustomBusinessDay', 'CDay', 'CBMonthEnd', - 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', - 'BusinessHour', 'BYearBegin', 'BYearEnd', - 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), - ('FY5253Quarter', {'qtr_with_extra_week': 1, - 'startingMonth': 1, - 'weekday': 2, - 'variation': 'nearest'}), - ('FY5253', {'weekday': 0, - 'startingMonth': 2, - 'variation': - 'nearest'}), - ('WeekOfMonth', {'weekday': 2, - 'week': 2}), - 'Easter', ('DateOffset', {'day': 4}), - ('DateOffset', {'month': 5})] - - with warnings.catch_warnings(record=True): - for normalize in (True, False): - for do in offsets: - if isinstance(do, tuple): - do, kwargs = do - else: - do = do - kwargs = {} - - for n in [0, 5]: - if (do in ['WeekOfMonth', 'LastWeekOfMonth', - 'FY5253Quarter', 'FY5253'] and n == 0): - continue - op = getattr(pd.offsets, do)(n, - normalize=normalize, - **kwargs) - assert_func(klass([x + op for x in s]), s + op) - assert_func(klass([x - op for x in s]), s - op) - assert_func(klass([op + x for x in s]), op + s) - - @pytest.mark.parametrize('years,months', product([-1, 0, 1], [-2, 0, 2])) def 
test_shift_months(years, months): s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py new file mode 100644 index 0000000000000..66aa5d2db6569 --- /dev/null +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -0,0 +1,435 @@ +# -*- coding: utf-8 -*- +from datetime import timedelta +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import (Timedelta, + period_range, Period, PeriodIndex, + _np_version_under1p10) +import pandas.core.indexes.period as period + + +class TestPeriodIndexArithmetic(object): + def test_add_iadd(self): + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + # previously performed setop union, now raises TypeError (GH14164) + with pytest.raises(TypeError): + rng + other + + with pytest.raises(TypeError): + rng += other + + # offset + # DateOffset + rng = pd.period_range('2014', '2024', freq='A') + result = rng + pd.offsets.YearEnd(5) + expected = pd.period_range('2019', '2029', freq='A') + tm.assert_index_equal(result, expected) + rng += pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365), Timedelta(days=365)]: + msg = ('Input has different freq(=.+)? ' + 'from PeriodIndex\\(freq=A-DEC\\)') + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng + o + + rng = pd.period_range('2014-01', '2016-12', freq='M') + result = rng + pd.offsets.MonthEnd(5) + expected = pd.period_range('2014-06', '2017-05', freq='M') + tm.assert_index_equal(result, expected) + rng += pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365), Timedelta(days=365)]: + rng = pd.period_range('2014-01', '2016-12', freq='M') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=M\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng + o + + # Tick + offsets = [pd.offsets.Day(3), timedelta(days=3), + np.timedelta64(3, 'D'), pd.offsets.Hour(72), + timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h'), + Timedelta('72:00:00')] + for delta in offsets: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + result = rng + delta + expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23), Timedelta('23:00:00')]: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=D\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng + o + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), pd.offsets.Minute(120), + timedelta(minutes=120), np.timedelta64(120, 'm'), + Timedelta(minutes=120)] + for delta in offsets: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', + freq='H') + result = rng + delta + expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', + freq='H') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), + np.timedelta64(30, 's'), Timedelta(seconds=30)]: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', + freq='H') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=H\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng + delta + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng += delta + + # int + rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + result = rng + 1 + expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub(self): + rng = period_range('2007-01', periods=50) + + result = rng - 5 + exp = rng + (-5) + tm.assert_index_equal(result, exp) + + def test_sub_isub(self): + + # previously performed setop, now raises TypeError (GH14164) + # TODO needs to wait on #13077 for decision on result type + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + with pytest.raises(TypeError): + rng - other + + with pytest.raises(TypeError): + rng -= other + + # offset + # DateOffset + rng = pd.period_range('2014', '2024', freq='A') + result = rng - pd.offsets.YearEnd(5) + expected = pd.period_range('2009', '2019', freq='A') + tm.assert_index_equal(result, expected) + rng -= pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + rng = pd.period_range('2014', '2024', freq='A') + msg = ('Input has different freq(=.+)? ' + 'from PeriodIndex\\(freq=A-DEC\\)') + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng - o + + rng = pd.period_range('2014-01', '2016-12', freq='M') + result = rng - pd.offsets.MonthEnd(5) + expected = pd.period_range('2013-08', '2016-07', freq='M') + tm.assert_index_equal(result, expected) + rng -= pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(365, 'D'), + timedelta(365)]: + rng = pd.period_range('2014-01', '2016-12', freq='M') + msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=M\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng - o + + # Tick + offsets = [pd.offsets.Day(3), timedelta(days=3), + np.timedelta64(3, 'D'), pd.offsets.Hour(72), + timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h')] + for delta in offsets: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + result = rng - delta + expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), + pd.offsets.Minute(), np.timedelta64(4, 'h'), + timedelta(hours=23)]: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=D\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng - o + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), pd.offsets.Minute(120), + timedelta(minutes=120), np.timedelta64(120, 'm')] + for delta in offsets: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', + freq='H') + result = rng - delta + expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', + freq='H') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), + np.timedelta64(30, 's')]: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', + freq='H') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=H\\)' + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng + delta + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + rng += delta + + # int + rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + result = rng - 1 + expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + +class TestPeriodIndexSeriesMethods(object): + """ Test PeriodIndex and Period Series Ops consistency """ + + def _check(self, values, func, expected): + idx = pd.PeriodIndex(values) + result = func(idx) + if isinstance(expected, pd.Index): + tm.assert_index_equal(result, expected) + else: + # comp op results in bool + tm.assert_numpy_array_equal(result, expected) + + s = pd.Series(values) + result = func(s) + + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_ops(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + expected = PeriodIndex(['2011-03', '2011-04', + '2011-05', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + result = idx - Period('2011-01', freq='M') + exp = pd.Index([0, 1, 2, 3], name='idx') + tm.assert_index_equal(result, exp) + + result = Period('2011-01', freq='M') - idx + exp = pd.Index([0, -1, -2, -3], name='idx') + tm.assert_index_equal(result, exp) + + def test_pi_ops_errors(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + s = pd.Series(idx) + + msg = r"unsupported operand type\(s\)" + + for obj in [idx, s]: + for ng in ["str", 1.5]: + with tm.assert_raises_regex(TypeError, msg): + obj + ng + + with pytest.raises(TypeError): + # error message differs between PY2 and 3 + ng + obj + + with 
tm.assert_raises_regex(TypeError, msg): + obj - ng + + with pytest.raises(TypeError): + np.add(obj, ng) + + if _np_version_under1p10: + assert np.add(ng, obj) is NotImplemented + else: + with pytest.raises(TypeError): + np.add(ng, obj) + + with pytest.raises(TypeError): + np.subtract(obj, ng) + + if _np_version_under1p10: + assert np.subtract(ng, obj) is NotImplemented + else: + with pytest.raises(TypeError): + np.subtract(ng, obj) + + def test_pi_ops_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + expected = PeriodIndex(['2011-03', '2011-04', + 'NaT', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) + + self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='2M', name='idx') + expected = PeriodIndex(['2011-07', '2011-08', + 'NaT', '2011-10'], freq='2M', name='idx') + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) + + def test_pi_ops_array_int(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + f = lambda x: x + np.array([1, 2, 3, 4]) + exp = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.add(x, np.array([4, -1, 1, 2])) + exp = PeriodIndex(['2011-05', '2011-01', 'NaT', + '2011-06'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - np.array([1, 2, 3, 4]) + exp = PeriodIndex(['2010-12', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + self._check(idx, f, exp) + + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) + exp = PeriodIndex(['2010-10', '2010-12', 'NaT', + '2011-06'], freq='M', name='idx') + self._check(idx, f, exp) + + def test_pi_ops_offset(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + f = lambda x: x + pd.offsets.Day() + exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', + '2011-04-02'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x + pd.offsets.Day(2) + exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', + '2011-04-03'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - pd.offsets.Day(2) + exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', + '2011-03-30'], freq='D', name='idx') + self._check(idx, f, exp) + + def test_pi_offset_errors(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + s = pd.Series(idx) + + # Series op is applied per Period instance, thus error is raised + # from Period + msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)" + msg_s = r"Input cannot be converted to Period\(freq=D\)" + for obj, msg in [(idx, msg_idx), (s, msg_s)]: + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + obj + pd.offsets.Hour(2) + + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + pd.offsets.Hour(2) + obj + + with tm.assert_raises_regex( + period.IncompatibleFrequency, msg): + obj - pd.offsets.Hour(2) + + def test_pi_sub_period(self): + # GH 13071 + idx = PeriodIndex(['2011-01', 
'2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + result = idx - pd.Period('2012-01', freq='M') + exp = pd.Index([-12, -11, -10, -9], name='idx') + tm.assert_index_equal(result, exp) + + result = np.subtract(idx, pd.Period('2012-01', freq='M')) + tm.assert_index_equal(result, exp) + + result = pd.Period('2012-01', freq='M') - idx + exp = pd.Index([12, 11, 10, 9], name='idx') + tm.assert_index_equal(result, exp) + + result = np.subtract(pd.Period('2012-01', freq='M'), idx) + if _np_version_under1p10: + assert result is NotImplemented + else: + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') + tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) + tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + + def test_pi_sub_pdnat(self): + # GH 13071 + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') + tm.assert_index_equal(pd.NaT - idx, exp) + tm.assert_index_equal(idx - pd.NaT, exp) + + def test_pi_sub_period_nat(self): + # GH 13071 + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + result = idx - pd.Period('2012-01', freq='M') + exp = pd.Index([-12, np.nan, -10, -9], name='idx') + tm.assert_index_equal(result, exp) + + result = pd.Period('2012-01', freq='M') - idx + exp = pd.Index([12, np.nan, 10, 9], name='idx') + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') + tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) + tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 7acc335c31be4..1d77de0d2d8f3 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,14 +1,13 @@ import pytest import numpy as np -from datetime import timedelta import pandas as pd import pandas._libs.tslib as tslib import pandas.util.testing as tm import pandas.core.indexes.period as period from pandas import (DatetimeIndex, PeriodIndex, period_range, Series, Period, - _np_version_under1p10, Index, Timedelta, offsets) + _np_version_under1p10, Index) from pandas.tests.test_base import Ops @@ -286,216 +285,6 @@ def test_resolution(self): idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) assert idx.resolution == expected - def test_add_iadd(self): - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) - - # previously performed setop union, now raises TypeError (GH14164) - with pytest.raises(TypeError): - rng + other - - with pytest.raises(TypeError): - rng += other - - # offset - # DateOffset - rng = pd.period_range('2014', '2024', freq='A') - result = rng + pd.offsets.YearEnd(5) - expected = pd.period_range('2019', '2029', freq='A') - tm.assert_index_equal(result, expected) - rng += pd.offsets.YearEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365), Timedelta(days=365)]: - msg = ('Input has different freq(=.+)? 
' - 'from PeriodIndex\\(freq=A-DEC\\)') - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - rng = pd.period_range('2014-01', '2016-12', freq='M') - result = rng + pd.offsets.MonthEnd(5) - expected = pd.period_range('2014-06', '2017-05', freq='M') - tm.assert_index_equal(result, expected) - rng += pd.offsets.MonthEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365), Timedelta(days=365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=M\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), - np.timedelta64(3, 'D'), pd.offsets.Hour(72), - timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h'), - Timedelta('72:00:00')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng + delta - expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23), Timedelta('23:00:00')]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=D\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), pd.offsets.Minute(120), - timedelta(minutes=120), np.timedelta64(120, 'm'), - Timedelta(minutes=120)] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - result = rng + delta - expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', - freq='H') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), - np.timedelta64(30, 's'), Timedelta(seconds=30)]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=H\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + delta - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng += delta - - # int - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - result = rng + 1 - expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - def test_sub(self): - rng = period_range('2007-01', periods=50) - - result = rng - 5 - exp = rng + (-5) - tm.assert_index_equal(result, exp) - - def test_sub_isub(self): - - # previously performed setop, now raises TypeError (GH14164) - # TODO needs to wait on #13077 for decision on result type - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) - - with pytest.raises(TypeError): - rng - other - - with pytest.raises(TypeError): - rng -= other - - # offset - # DateOffset - rng = pd.period_range('2014', '2024', freq='A') - result = rng - pd.offsets.YearEnd(5) - expected = pd.period_range('2009', '2019', freq='A') - tm.assert_index_equal(result, expected) - rng -= pd.offsets.YearEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: - rng = pd.period_range('2014', '2024', freq='A') - msg = ('Input has different freq(=.+)? ' - 'from PeriodIndex\\(freq=A-DEC\\)') - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o - - rng = pd.period_range('2014-01', '2016-12', freq='M') - result = rng - pd.offsets.MonthEnd(5) - expected = pd.period_range('2013-08', '2016-07', freq='M') - tm.assert_index_equal(result, expected) - rng -= pd.offsets.MonthEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=M\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o - - # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), - np.timedelta64(3, 'D'), pd.offsets.Hour(72), - timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng - delta - expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=D\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), pd.offsets.Minute(120), - timedelta(minutes=120), np.timedelta64(120, 'm')] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - result = rng - delta - expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', - freq='H') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), - np.timedelta64(30, 's')]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=H\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + delta - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng += delta - - # int - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - result = rng - 1 - expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - def test_comp_nat(self): left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, pd.Period('2011-01-03')]) @@ -869,199 +658,6 @@ def _check(self, values, func, expected): exp = pd.Series(expected, name=values.name) tm.assert_series_equal(result, exp) - def test_pi_ops(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') - - expected = PeriodIndex(['2011-03', '2011-04', - '2011-05', '2011-06'], freq='M', name='idx') - self._check(idx, lambda x: x + 2, expected) - self._check(idx, lambda x: 2 + x, expected) - - self._check(idx + 2, lambda x: x - 2, idx) - result = idx - Period('2011-01', freq='M') - exp = pd.Index([0, 1, 2, 3], name='idx') - tm.assert_index_equal(result, exp) - - result = Period('2011-01', freq='M') - idx - exp = pd.Index([0, -1, -2, -3], name='idx') - tm.assert_index_equal(result, exp) - - def test_pi_ops_errors(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') - s = pd.Series(idx) - - msg = r"unsupported operand type\(s\)" - - for obj in [idx, s]: - for ng in ["str", 1.5]: - with tm.assert_raises_regex(TypeError, msg): - obj + ng - - with pytest.raises(TypeError): - # error message differs between PY2 and 3 - ng + obj - - with tm.assert_raises_regex(TypeError, msg): - obj - ng - - with pytest.raises(TypeError): - np.add(obj, ng) - - if _np_version_under1p10: - assert np.add(ng, obj) is NotImplemented - else: - with pytest.raises(TypeError): - np.add(ng, obj) - - with pytest.raises(TypeError): - np.subtract(obj, ng) - - if _np_version_under1p10: - assert np.subtract(ng, obj) is NotImplemented - else: - with pytest.raises(TypeError): - np.subtract(ng, obj) - - def test_pi_ops_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - expected = PeriodIndex(['2011-03', '2011-04', - 'NaT', '2011-06'], freq='M', name='idx') - self._check(idx, lambda x: x + 2, expected) - self._check(idx, lambda x: 2 + x, expected) - self._check(idx, lambda x: np.add(x, 2), expected) - - self._check(idx + 2, lambda x: x - 2, idx) - self._check(idx + 2, lambda x: np.subtract(x, 2), idx) - - # freq with mult - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='2M', name='idx') - expected = PeriodIndex(['2011-07', '2011-08', - 'NaT', 
'2011-10'], freq='2M', name='idx') - self._check(idx, lambda x: x + 3, expected) - self._check(idx, lambda x: 3 + x, expected) - self._check(idx, lambda x: np.add(x, 3), expected) - - self._check(idx + 3, lambda x: x - 3, idx) - self._check(idx + 3, lambda x: np.subtract(x, 3), idx) - - def test_pi_ops_array_int(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - f = lambda x: x + np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: np.add(x, np.array([4, -1, 1, 2])) - exp = PeriodIndex(['2011-05', '2011-01', 'NaT', - '2011-06'], freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: x - np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2010-12', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') - self._check(idx, f, exp) - - f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) - exp = PeriodIndex(['2010-10', '2010-12', 'NaT', - '2011-06'], freq='M', name='idx') - self._check(idx, f, exp) - - def test_pi_ops_offset(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') - f = lambda x: x + offsets.Day() - exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', - '2011-04-02'], freq='D', name='idx') - self._check(idx, f, exp) - - f = lambda x: x + offsets.Day(2) - exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', - '2011-04-03'], freq='D', name='idx') - self._check(idx, f, exp) - - f = lambda x: x - offsets.Day(2) - exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', - '2011-03-30'], freq='D', name='idx') - self._check(idx, f, exp) - - def test_pi_offset_errors(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') - s = pd.Series(idx) - - # Series op is applied per Period instance, thus error is raised - # from Period - msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)" - msg_s = r"Input cannot be converted to Period\(freq=D\)" - for obj, msg in [(idx, msg_idx), (s, msg_s)]: - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - obj + offsets.Hour(2) - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - offsets.Hour(2) + obj - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - obj - offsets.Hour(2) - - def test_pi_sub_period(self): - # GH 13071 - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') - - result = idx - pd.Period('2012-01', freq='M') - exp = pd.Index([-12, -11, -10, -9], name='idx') - tm.assert_index_equal(result, exp) - - result = np.subtract(idx, pd.Period('2012-01', freq='M')) - tm.assert_index_equal(result, exp) - - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12, 11, 10, 9], name='idx') - tm.assert_index_equal(result, exp) - - result = np.subtract(pd.Period('2012-01', freq='M'), idx) - if _np_version_under1p10: - assert result is NotImplemented - else: - tm.assert_index_equal(result, exp) - - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) - - def test_pi_sub_pdnat(self): - # GH 13071 - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') - tm.assert_index_equal(pd.NaT - idx, exp) - tm.assert_index_equal(idx - pd.NaT, exp) - - def 
test_pi_sub_period_nat(self): - # GH 13071 - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M', name='idx') - - result = idx - pd.Period('2012-01', freq='M') - exp = pd.Index([-12, np.nan, -10, -9], name='idx') - tm.assert_index_equal(result, exp) - - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12, np.nan, 10, 9], name='idx') - tm.assert_index_equal(result, exp) - - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) - def test_pi_comp_period(self): idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq='M', name='idx') diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py new file mode 100644 index 0000000000000..9341cf2202f4c --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -0,0 +1,770 @@ +# -*- coding: utf-8 -*- +import pytest +import numpy as np +from datetime import timedelta +from distutils.version import LooseVersion + +import pandas as pd +import pandas.util.testing as tm +from pandas import (DatetimeIndex, TimedeltaIndex, Float64Index, Int64Index, + to_timedelta, timedelta_range, date_range, + Series, + Timestamp, Timedelta) + + +class TestTimedeltaIndexArithmetic(object): + _holder = TimedeltaIndex + _multiprocess_can_split_ = True + + # TODO: Split by ops, better name + def test_numeric_compat(self): + idx = self._holder(np.arange(5, dtype='int64')) + didx = self._holder(np.arange(5, dtype='int64') ** 2) + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, + self._holder(np.arange(5, dtype='int64') * 5)) + + result = idx * np.arange(5, dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='float64') + 0.1) + tm.assert_index_equal(result, self._holder(np.arange( + 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) + + # invalid + pytest.raises(TypeError, lambda: idx * idx) + pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) + pytest.raises(ValueError, lambda: idx * np.array([1, 2])) + + # FIXME: duplicate. This came from `test_timedelta`, whereas the + # version above came from `test_astype`. Make sure there aren't more + # duplicates. 
+ def test_numeric_compat__(self): + + idx = self._holder(np.arange(5, dtype='int64')) + didx = self._holder(np.arange(5, dtype='int64') ** 2) + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, + self._holder(np.arange(5, dtype='int64') * 5)) + + result = idx * np.arange(5, dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5, dtype='float64') + 0.1) + tm.assert_index_equal(result, self._holder(np.arange( + 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) + + # invalid + pytest.raises(TypeError, lambda: idx * idx) + pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) + pytest.raises(ValueError, lambda: idx * np.array([1, 2])) + + def test_ufunc_coercions(self): + # normal ops are also tested in tseries/test_timedeltas.py + idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], + freq='2H', name='x') + + for result in [idx * 2, np.multiply(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'], + freq='4H', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '4H' + + for result in [idx / 2, np.divide(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'], + freq='H', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == 'H' + + idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], + freq='2H', name='x') + for result in [-idx, np.negative(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'], + freq='-2H', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '-2H' + + idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'], + freq='H', name='x') + for result in [abs(idx), np.absolute(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'], + freq=None, name='x') + tm.assert_index_equal(result, exp) + assert result.freq is None + + def test_add_iadd(self): + # only test adding/sub offsets as + is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days', '10 days') + result = rng + delta + expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', + freq='D') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + result = rng + 1 + expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub_isub(self): + # only test adding/sub offsets as - is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days', '10 days') + result = rng - delta + expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 
09:00:00', freq='H', periods=10) + result = rng - 1 + expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + idx = TimedeltaIndex(['1 day', '2 day']) + msg = "cannot subtract a datelike from a TimedeltaIndex" + with tm.assert_raises_regex(TypeError, msg): + idx - Timestamp('2011-01-01') + + result = Timestamp('2011-01-01') + idx + expected = DatetimeIndex(['2011-01-02', '2011-01-03']) + tm.assert_index_equal(result, expected) + + # TODO: Split by operation, better name + def test_ops_compat(self): + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + rng = timedelta_range('1 days', '10 days', name='foo') + + # multiply + for offset in offsets: + pytest.raises(TypeError, lambda: rng * offset) + + # divide + expected = Int64Index((np.arange(10) + 1) * 12, name='foo') + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result, expected, exact=False) + + # floor divide + expected = Int64Index((np.arange(10) + 1) * 12, name='foo') + for offset in offsets: + result = rng // offset + tm.assert_index_equal(result, expected, exact=False) + + # divide with nats + rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + expected = Float64Index([12, np.nan, 24], name='foo') + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result, expected) + + # don't allow division by NaT (maybe we could in the future) + pytest.raises(TypeError, lambda: rng / pd.NaT) + + def test_subtraction_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + td = Timedelta('1 days') + dt = Timestamp('20130101') + + pytest.raises(TypeError, lambda: tdi - dt) + pytest.raises(TypeError, lambda: tdi - dti) + pytest.raises(TypeError, lambda: td - dt) + pytest.raises(TypeError, lambda: td - dti) + + result = dt - dti + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') + tm.assert_index_equal(result, expected) + + result = dti - dt + expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') + tm.assert_index_equal(result, expected) + + result = tdi - td + expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) + + result = td - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') + tm.assert_index_equal(result, expected, check_names=False) + + result = dti - td + expected = DatetimeIndex( + ['20121231', '20130101', '20130102'], name='bar') + tm.assert_index_equal(result, expected, check_names=False) + + result = dt - tdi + expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') + tm.assert_index_equal(result, expected) + + def test_subtraction_ops_with_tz(self): + + # check that dt/dti subtraction ops with tz are validated + dti = date_range('20130101', periods=3) + ts = Timestamp('20130101') + dt = ts.to_pydatetime() + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + ts_tz = Timestamp('20130101').tz_localize('US/Eastern') + ts_tz2 = Timestamp('20130101').tz_localize('CET') + dt_tz = ts_tz.to_pydatetime() + td = Timedelta('1 days') + + def _check(result, expected): + assert result == expected + assert isinstance(result, Timedelta) + + # scalars + result = ts - ts + expected = Timedelta('0 days') + _check(result, expected) + + result = dt_tz - ts_tz + 
expected = Timedelta('0 days') + _check(result, expected) + + result = ts_tz - dt_tz + expected = Timedelta('0 days') + _check(result, expected) + + # tz mismatches + pytest.raises(TypeError, lambda: dt_tz - ts) + pytest.raises(TypeError, lambda: dt_tz - dt) + pytest.raises(TypeError, lambda: dt_tz - ts_tz2) + pytest.raises(TypeError, lambda: dt - dt_tz) + pytest.raises(TypeError, lambda: ts - dt_tz) + pytest.raises(TypeError, lambda: ts_tz2 - ts) + pytest.raises(TypeError, lambda: ts_tz2 - dt) + pytest.raises(TypeError, lambda: ts_tz - ts_tz2) + + # with dti + pytest.raises(TypeError, lambda: dti - ts_tz) + pytest.raises(TypeError, lambda: dti_tz - ts) + pytest.raises(TypeError, lambda: dti_tz - ts_tz2) + + result = dti_tz - dt_tz + expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + tm.assert_index_equal(result, expected) + + result = dt_tz - dti_tz + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + tm.assert_index_equal(result, expected) + + result = dti_tz - ts_tz + expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + tm.assert_index_equal(result, expected) + + result = ts_tz - dti_tz + expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + tm.assert_index_equal(result, expected) + + result = td - td + expected = Timedelta('0 days') + _check(result, expected) + + result = dti_tz - td + expected = DatetimeIndex( + ['20121231', '20130101', '20130102'], tz='US/Eastern') + tm.assert_index_equal(result, expected) + + def test_dti_tdi_numeric_ops(self): + # These are normally union/diff set-like ops + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + + # TODO(wesm): unused? + # td = Timedelta('1 days') + # dt = Timestamp('20130101') + + result = tdi - tdi + expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = tdi + tdi + expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = dti - tdi # name will be reset + expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) + tm.assert_index_equal(result, expected) + + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'H']: + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + + with pytest.raises(TypeError): + idx - p + + with pytest.raises(TypeError): + p - idx + + def test_addition_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + dti = date_range('20130101', periods=3, name='bar') + td = Timedelta('1 days') + dt = Timestamp('20130101') + + result = tdi + dt + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) + + result = dt + tdi + expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + tm.assert_index_equal(result, expected) + + result = td + tdi + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) + + result = tdi + td + expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + tm.assert_index_equal(result, expected) + + # unequal length + pytest.raises(ValueError, lambda: tdi + dti[0:1]) + pytest.raises(ValueError, lambda: tdi[0:1] + dti) + + # random indexes + pytest.raises(TypeError, lambda: tdi + Int64Index([1, 2, 3])) + + # this is a union! 
+ # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) + + result = tdi + dti # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) + + result = dti + tdi # name will be reset + expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + tm.assert_index_equal(result, expected) + + result = dt + td + expected = Timestamp('20130102') + assert result == expected + + result = td + dt + expected = Timestamp('20130102') + assert result == expected + + # TODO: Split by op, better name + def test_ops(self): + td = Timedelta(10, unit='d') + assert -td == Timedelta(-10, unit='d') + assert +td == Timedelta(10, unit='d') + assert td - td == Timedelta(0, unit='ns') + assert (td - pd.NaT) is pd.NaT + assert td + td == Timedelta(20, unit='d') + assert (td + pd.NaT) is pd.NaT + assert td * 2 == Timedelta(20, unit='d') + assert (td * pd.NaT) is pd.NaT + assert td / 2 == Timedelta(5, unit='d') + assert td // 2 == Timedelta(5, unit='d') + assert abs(td) == td + assert abs(-td) == td + assert td / td == 1 + assert (td / pd.NaT) is np.nan + assert (td // pd.NaT) is np.nan + + # invert + assert -td == Timedelta('-10d') + assert td * -1 == Timedelta('-10d') + assert -1 * td == Timedelta('-10d') + assert abs(-td) == Timedelta('10d') + + # invalid multiply with another timedelta + pytest.raises(TypeError, lambda: td * td) + + # can't operate with integers + pytest.raises(TypeError, lambda: td + 2) + pytest.raises(TypeError, lambda: td - 2) + + def test_ops_offsets(self): + td = Timedelta(10, unit='d') + assert Timedelta(241, unit='h') == td + pd.offsets.Hour(1) + assert Timedelta(241, unit='h') == pd.offsets.Hour(1) + td + assert 240 == td / pd.offsets.Hour(1) + assert 1 / 240.0 == pd.offsets.Hour(1) / td + assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) + assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td + + def test_ops_ndarray(self): + td = Timedelta('1 day') + + # timedelta, timedelta + other = pd.to_timedelta(['1 day']).values + expected = pd.to_timedelta(['2 days']).values + tm.assert_numpy_array_equal(td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + tm.assert_numpy_array_equal(other + td, expected) + pytest.raises(TypeError, lambda: td + np.array([1])) + pytest.raises(TypeError, lambda: np.array([1]) + td) + + expected = pd.to_timedelta(['0 days']).values + tm.assert_numpy_array_equal(td - other, expected) + if LooseVersion(np.__version__) >= '1.8': + tm.assert_numpy_array_equal(-other + td, expected) + pytest.raises(TypeError, lambda: td - np.array([1])) + pytest.raises(TypeError, lambda: np.array([1]) - td) + + expected = pd.to_timedelta(['2 days']).values + tm.assert_numpy_array_equal(td * np.array([2]), expected) + tm.assert_numpy_array_equal(np.array([2]) * td, expected) + pytest.raises(TypeError, lambda: td * other) + pytest.raises(TypeError, lambda: other * td) + + tm.assert_numpy_array_equal(td / other, + np.array([1], dtype=np.float64)) + if LooseVersion(np.__version__) >= '1.8': + tm.assert_numpy_array_equal(other / td, + np.array([1], dtype=np.float64)) + + # timedelta, datetime + other = pd.to_datetime(['2000-01-01']).values + expected = pd.to_datetime(['2000-01-02']).values + tm.assert_numpy_array_equal(td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + tm.assert_numpy_array_equal(other + td, expected) + + expected = pd.to_datetime(['1999-12-31']).values + tm.assert_numpy_array_equal(-td + other, expected) + if LooseVersion(np.__version__) >= '1.8': + 
tm.assert_numpy_array_equal(other - td, expected) + + def test_ops_series(self): + # regression test for GH8813 + td = Timedelta('1 day') + other = pd.Series([1, 2]) + expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) + tm.assert_series_equal(expected, td * other) + tm.assert_series_equal(expected, other * td) + + def test_ops_series_object(self): + # GH 13043 + s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], + name='xxx') + assert s.dtype == object + + exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], + name='xxx') + tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) + tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) + + # object series & object series + s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), + pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], + name='xxx') + assert s2.dtype == object + exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')], + name='xxx') + tm.assert_series_equal(s2 - s, exp) + tm.assert_series_equal(s - s2, -exp) + + s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], + name='xxx', dtype=object) + assert s.dtype == object + + exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], + name='xxx') + tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp) + tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) + + def test_ops_notimplemented(self): + class Other: + pass + + other = Other() + + td = Timedelta('1 day') + assert td.__add__(other) is NotImplemented + assert td.__sub__(other) is NotImplemented + assert td.__truediv__(other) is NotImplemented + assert td.__mul__(other) is NotImplemented + assert td.__floordiv__(other) is NotImplemented + + def test_timedelta_ops_scalar(self): + # GH 6808 + base = pd.to_datetime('20130101 09:01:12.123456') + expected_add = pd.to_datetime('20130101 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), + np.timedelta64(10, 's'), + np.timedelta64(10000000000, 'ns'), + pd.offsets.Second(10)]: + result = base + offset + assert result == expected_add + + result = base - offset + assert result == expected_sub + + base = pd.to_datetime('20130102 09:01:12.123456') + expected_add = pd.to_datetime('20130103 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta('1 day, 00:00:10'), + pd.to_timedelta('1 days, 00:00:10'), + timedelta(days=1, seconds=10), + np.timedelta64(1, 'D') + np.timedelta64(10, 's'), + pd.offsets.Day() + pd.offsets.Second(10)]: + result = base + offset + assert result == expected_add + + result = base - offset + assert result == expected_sub + + def test_timedelta_ops_with_missing_values(self): + # setup + s1 = pd.to_timedelta(Series(['00:00:01'])) + s2 = pd.to_timedelta(Series(['00:00:02'])) + sn = pd.to_timedelta(Series([pd.NaT])) + df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta) + df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta) + dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + scalar1 = pd.to_timedelta('00:00:01') + scalar2 = pd.to_timedelta('00:00:02') + timedelta_NaT = pd.to_timedelta('NaT') + NA = np.nan + + actual = scalar1 + scalar1 + assert actual == scalar2 + actual = scalar2 - scalar1 + assert actual == scalar1 + + actual = s1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - s1 + tm.assert_series_equal(actual, s1) + + actual = s1 
+ scalar1 + tm.assert_series_equal(actual, s2) + actual = scalar1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - scalar1 + tm.assert_series_equal(actual, s1) + actual = -scalar1 + s2 + tm.assert_series_equal(actual, s1) + + actual = s1 + timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + actual = s1 - timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = -timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + + actual = s1 + NA + tm.assert_series_equal(actual, sn) + actual = NA + s1 + tm.assert_series_equal(actual, sn) + actual = s1 - NA + tm.assert_series_equal(actual, sn) + actual = -NA + s1 + tm.assert_series_equal(actual, sn) + + actual = s1 + pd.NaT + tm.assert_series_equal(actual, sn) + actual = s2 - pd.NaT + tm.assert_series_equal(actual, sn) + + actual = s1 + df1 + tm.assert_frame_equal(actual, df2) + actual = s2 - df1 + tm.assert_frame_equal(actual, df1) + actual = df1 + s1 + tm.assert_frame_equal(actual, df2) + actual = df2 - s1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + df1 + tm.assert_frame_equal(actual, df2) + actual = df2 - df1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + scalar1 + tm.assert_frame_equal(actual, df2) + actual = df2 - scalar1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + timedelta_NaT + tm.assert_frame_equal(actual, dfn) + actual = df1 - timedelta_NaT + tm.assert_frame_equal(actual, dfn) + + actual = df1 + NA + tm.assert_frame_equal(actual, dfn) + actual = df1 - NA + tm.assert_frame_equal(actual, dfn) + + actual = df1 + pd.NaT # NaT is datetime, not timedelta + tm.assert_frame_equal(actual, dfn) + actual = df1 - pd.NaT + tm.assert_frame_equal(actual, dfn) + + def test_add_overflow(self): + # see gh-14068 + msg = "too (big|large) to convert" + with tm.assert_raises_regex(OverflowError, msg): + to_timedelta(106580, 'D') + Timestamp('2000') + with tm.assert_raises_regex(OverflowError, msg): + Timestamp('2000') + to_timedelta(106580, 'D') + + _NaT = int(pd.NaT) + 1 + msg = "Overflow in int64 addition" + with tm.assert_raises_regex(OverflowError, msg): + to_timedelta([106580], 'D') + Timestamp('2000') + with tm.assert_raises_regex(OverflowError, msg): + Timestamp('2000') + to_timedelta([106580], 'D') + with tm.assert_raises_regex(OverflowError, msg): + to_timedelta([_NaT]) - Timedelta('1 days') + with tm.assert_raises_regex(OverflowError, msg): + to_timedelta(['5 days', _NaT]) - Timedelta('1 days') + with tm.assert_raises_regex(OverflowError, msg): + (to_timedelta([_NaT, '5 days', '1 hours']) - + to_timedelta(['7 seconds', _NaT, '4 hours'])) + + # These should not overflow! 
+ exp = TimedeltaIndex([pd.NaT]) + result = to_timedelta([pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(['4 days', pd.NaT]) + result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) + result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + tm.assert_index_equal(result, exp) + + def test_tdi_ops_attributes(self): + rng = timedelta_range('2 days', periods=5, freq='2D', name='x') + + result = rng + 1 + exp = timedelta_range('4 days', periods=5, freq='2D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '2D' + + result = rng - 2 + exp = timedelta_range('-2 days', periods=5, freq='2D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '2D' + + result = rng * 2 + exp = timedelta_range('4 days', periods=5, freq='4D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '4D' + + result = rng / 2 + exp = timedelta_range('1 days', periods=5, freq='D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == 'D' + + result = -rng + exp = timedelta_range('-2 days', periods=5, freq='-2D', name='x') + tm.assert_index_equal(result, exp) + assert result.freq == '-2D' + + rng = pd.timedelta_range('-2 days', periods=5, freq='D', name='x') + + result = abs(rng) + exp = TimedeltaIndex(['2 days', '1 days', '0 days', '1 days', + '2 days'], name='x') + tm.assert_index_equal(result, exp) + assert result.freq is None + + # TODO: Needs more informative name, probably split up into + # more targeted tests + @pytest.mark.parametrize('freq', ['B', 'D']) + def test_timedelta(self, freq): + index = date_range('1/1/2000', periods=50, freq=freq) + + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + tm.assert_index_equal(index, back) + + if freq == 'D': + expected = pd.tseries.offsets.Day(1) + assert index.freq == expected + assert shifted.freq == expected + assert back.freq == expected + else: # freq == 'B' + assert index.freq == pd.tseries.offsets.BusinessDay(1) + assert shifted.freq is None + assert back.freq == pd.tseries.offsets.BusinessDay(1) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index 586b96f980f8f..7a761cfe30c62 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -5,7 +5,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (TimedeltaIndex, timedelta_range, Int64Index, Float64Index, - Index, Timedelta, Series) + Index, Timedelta) from ..datetimelike import DatetimeLike @@ -14,6 +14,11 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex _multiprocess_can_split_ = True + def test_numeric_compat(self): + # Dummy method to override super's version; this test is now done + # in test_arithmetic.py + pass + def setup_method(self, method): self.indices = dict(index=tm.makeTimedeltaIndex(10)) 
self.setup_indices() @@ -86,38 +91,3 @@ def test_shift(self): '8 days 01:00:03', '9 days 01:00:03', '10 days 01:00:03'], freq='D') tm.assert_index_equal(result, expected) - - def test_numeric_compat(self): - - idx = self._holder(np.arange(5, dtype='int64')) - didx = self._holder(np.arange(5, dtype='int64') ** 2) - result = idx * 1 - tm.assert_index_equal(result, idx) - - result = 1 * idx - tm.assert_index_equal(result, idx) - - result = idx / 1 - tm.assert_index_equal(result, idx) - - result = idx // 1 - tm.assert_index_equal(result, idx) - - result = idx * np.array(5, dtype='int64') - tm.assert_index_equal(result, - self._holder(np.arange(5, dtype='int64') * 5)) - - result = idx * np.arange(5, dtype='int64') - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='int64')) - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='float64') + 0.1) - tm.assert_index_equal(result, self._holder(np.arange( - 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) - - # invalid - pytest.raises(TypeError, lambda: idx * idx) - pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) - pytest.raises(ValueError, lambda: idx * np.array([1, 2])) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index f4f669ee1d087..ff52afea2a918 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -2,15 +2,13 @@ import numpy as np from datetime import timedelta -from distutils.version import LooseVersion import pandas as pd import pandas.util.testing as tm from pandas import to_timedelta -from pandas.util.testing import assert_series_equal, assert_frame_equal -from pandas import (Series, Timedelta, DataFrame, Timestamp, TimedeltaIndex, - timedelta_range, date_range, DatetimeIndex, Int64Index, - _np_version_under1p10, Float64Index, Index) +from pandas import (Series, Timedelta, Timestamp, TimedeltaIndex, + timedelta_range, + _np_version_under1p10, Index) from pandas._libs.tslib import iNaT from pandas.tests.test_base import Ops @@ -194,17 +192,17 @@ def test_summary(self): idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - exp1 = """TimedeltaIndex: 0 entries -Freq: D""" + exp1 = ("TimedeltaIndex: 0 entries\n" + "Freq: D") - exp2 = """TimedeltaIndex: 1 entries, 1 days to 1 days -Freq: D""" + exp2 = ("TimedeltaIndex: 1 entries, 1 days to 1 days\n" + "Freq: D") - exp3 = """TimedeltaIndex: 2 entries, 1 days to 2 days -Freq: D""" + exp3 = ("TimedeltaIndex: 2 entries, 1 days to 2 days\n" + "Freq: D") - exp4 = """TimedeltaIndex: 3 entries, 1 days to 3 days -Freq: D""" + exp4 = ("TimedeltaIndex: 3 entries, 1 days to 3 days\n" + "Freq: D") exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " "00:00:00") @@ -214,289 +212,6 @@ def test_summary(self): result = idx.summary() assert result == expected - def test_add_iadd(self): - - # only test adding/sub offsets as + is now numeric - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = timedelta_range('1 days', '10 days') - result = rng + delta - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - # int - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - result = rng + 1 - expected = 
timedelta_range('1 days 10:00:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - def test_sub_isub(self): - # only test adding/sub offsets as - is now numeric - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - for delta in offsets: - rng = timedelta_range('1 days', '10 days') - result = rng - delta - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - # int - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - result = rng - 1 - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - idx = TimedeltaIndex(['1 day', '2 day']) - msg = "cannot subtract a datelike from a TimedeltaIndex" - with tm.assert_raises_regex(TypeError, msg): - idx - Timestamp('2011-01-01') - - result = Timestamp('2011-01-01') + idx - expected = DatetimeIndex(['2011-01-02', '2011-01-03']) - tm.assert_index_equal(result, expected) - - def test_ops_compat(self): - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)] - - rng = timedelta_range('1 days', '10 days', name='foo') - - # multiply - for offset in offsets: - pytest.raises(TypeError, lambda: rng * offset) - - # divide - expected = Int64Index((np.arange(10) + 1) * 12, name='foo') - for offset in offsets: - result = rng / offset - tm.assert_index_equal(result, expected, exact=False) - - # floor divide - expected = Int64Index((np.arange(10) + 1) * 12, name='foo') - for offset in offsets: - result = rng // offset - tm.assert_index_equal(result, expected, exact=False) - - # divide with nats - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - expected = Float64Index([12, np.nan, 24], name='foo') - for offset in offsets: - result = rng / offset - tm.assert_index_equal(result, expected) - - # don't allow division by NaT (make could in the future) - pytest.raises(TypeError, lambda: rng / pd.NaT) - - def test_subtraction_ops(self): - - # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') - - pytest.raises(TypeError, lambda: tdi - dt) - pytest.raises(TypeError, lambda: tdi - dti) - pytest.raises(TypeError, lambda: td - dt) - pytest.raises(TypeError, lambda: td - dti) - - result = dt - dti - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') - tm.assert_index_equal(result, expected) - - result = dti - dt - expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') - tm.assert_index_equal(result, expected) - - result = tdi - td - expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) - - result = td - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') - tm.assert_index_equal(result, expected, check_names=False) - - result = dti - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], name='bar') - tm.assert_index_equal(result, expected, check_names=False) - - result = dt - tdi - expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') - tm.assert_index_equal(result, expected) - - def test_subtraction_ops_with_tz(self): - - # check that 
dt/dti subtraction ops with tz are validated - dti = date_range('20130101', periods=3) - ts = Timestamp('20130101') - dt = ts.to_pydatetime() - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - ts_tz = Timestamp('20130101').tz_localize('US/Eastern') - ts_tz2 = Timestamp('20130101').tz_localize('CET') - dt_tz = ts_tz.to_pydatetime() - td = Timedelta('1 days') - - def _check(result, expected): - assert result == expected - assert isinstance(result, Timedelta) - - # scalars - result = ts - ts - expected = Timedelta('0 days') - _check(result, expected) - - result = dt_tz - ts_tz - expected = Timedelta('0 days') - _check(result, expected) - - result = ts_tz - dt_tz - expected = Timedelta('0 days') - _check(result, expected) - - # tz mismatches - pytest.raises(TypeError, lambda: dt_tz - ts) - pytest.raises(TypeError, lambda: dt_tz - dt) - pytest.raises(TypeError, lambda: dt_tz - ts_tz2) - pytest.raises(TypeError, lambda: dt - dt_tz) - pytest.raises(TypeError, lambda: ts - dt_tz) - pytest.raises(TypeError, lambda: ts_tz2 - ts) - pytest.raises(TypeError, lambda: ts_tz2 - dt) - pytest.raises(TypeError, lambda: ts_tz - ts_tz2) - - # with dti - pytest.raises(TypeError, lambda: dti - ts_tz) - pytest.raises(TypeError, lambda: dti_tz - ts) - pytest.raises(TypeError, lambda: dti_tz - ts_tz2) - - result = dti_tz - dt_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) - tm.assert_index_equal(result, expected) - - result = dt_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) - tm.assert_index_equal(result, expected) - - result = dti_tz - ts_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) - tm.assert_index_equal(result, expected) - - result = ts_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) - tm.assert_index_equal(result, expected) - - result = td - td - expected = Timedelta('0 days') - _check(result, expected) - - result = dti_tz - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], tz='US/Eastern') - tm.assert_index_equal(result, expected) - - def test_dti_tdi_numeric_ops(self): - - # These are normally union/diff set-like ops - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - - # TODO(wesm): unused? 
- # td = Timedelta('1 days') - # dt = Timestamp('20130101') - - result = tdi - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = tdi + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = dti - tdi # name will be reset - expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) - tm.assert_index_equal(result, expected) - - def test_sub_period(self): - # GH 13078 - # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - - for freq in [None, 'H']: - idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) - - with pytest.raises(TypeError): - idx - p - - with pytest.raises(TypeError): - p - idx - - def test_addition_ops(self): - - # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') - - result = tdi + dt - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') - tm.assert_index_equal(result, expected) - - result = dt + tdi - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') - tm.assert_index_equal(result, expected) - - result = td + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result, expected) - - result = tdi + td - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') - tm.assert_index_equal(result, expected) - - # unequal length - pytest.raises(ValueError, lambda: tdi + dti[0:1]) - pytest.raises(ValueError, lambda: tdi[0:1] + dti) - - # random indexes - pytest.raises(TypeError, lambda: tdi + Int64Index([1, 2, 3])) - - # this is a union! 
- # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) - - result = tdi + dti # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) - tm.assert_index_equal(result, expected) - - result = dti + tdi # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) - tm.assert_index_equal(result, expected) - - result = dt + td - expected = Timestamp('20130102') - assert result == expected - - result = td + dt - expected = Timestamp('20130102') - assert result == expected - def test_comp_nat(self): left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')]) @@ -864,143 +579,6 @@ def test_equals(self): class TestTimedeltas(object): _multiprocess_can_split_ = True - def test_ops(self): - - td = Timedelta(10, unit='d') - assert -td == Timedelta(-10, unit='d') - assert +td == Timedelta(10, unit='d') - assert td - td == Timedelta(0, unit='ns') - assert (td - pd.NaT) is pd.NaT - assert td + td == Timedelta(20, unit='d') - assert (td + pd.NaT) is pd.NaT - assert td * 2 == Timedelta(20, unit='d') - assert (td * pd.NaT) is pd.NaT - assert td / 2 == Timedelta(5, unit='d') - assert td // 2 == Timedelta(5, unit='d') - assert abs(td) == td - assert abs(-td) == td - assert td / td == 1 - assert (td / pd.NaT) is np.nan - assert (td // pd.NaT) is np.nan - - # invert - assert -td == Timedelta('-10d') - assert td * -1 == Timedelta('-10d') - assert -1 * td == Timedelta('-10d') - assert abs(-td) == Timedelta('10d') - - # invalid multiply with another timedelta - pytest.raises(TypeError, lambda: td * td) - - # can't operate with integers - pytest.raises(TypeError, lambda: td + 2) - pytest.raises(TypeError, lambda: td - 2) - - def test_ops_offsets(self): - td = Timedelta(10, unit='d') - assert Timedelta(241, unit='h') == td + pd.offsets.Hour(1) - assert Timedelta(241, unit='h') == pd.offsets.Hour(1) + td - assert 240 == td / pd.offsets.Hour(1) - assert 1 / 240.0 == pd.offsets.Hour(1) / td - assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) - assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td - - def test_ops_ndarray(self): - td = Timedelta('1 day') - - # timedelta, timedelta - other = pd.to_timedelta(['1 day']).values - expected = pd.to_timedelta(['2 days']).values - tm.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - tm.assert_numpy_array_equal(other + td, expected) - pytest.raises(TypeError, lambda: td + np.array([1])) - pytest.raises(TypeError, lambda: np.array([1]) + td) - - expected = pd.to_timedelta(['0 days']).values - tm.assert_numpy_array_equal(td - other, expected) - if LooseVersion(np.__version__) >= '1.8': - tm.assert_numpy_array_equal(-other + td, expected) - pytest.raises(TypeError, lambda: td - np.array([1])) - pytest.raises(TypeError, lambda: np.array([1]) - td) - - expected = pd.to_timedelta(['2 days']).values - tm.assert_numpy_array_equal(td * np.array([2]), expected) - tm.assert_numpy_array_equal(np.array([2]) * td, expected) - pytest.raises(TypeError, lambda: td * other) - pytest.raises(TypeError, lambda: other * td) - - tm.assert_numpy_array_equal(td / other, - np.array([1], dtype=np.float64)) - if LooseVersion(np.__version__) >= '1.8': - tm.assert_numpy_array_equal(other / td, - np.array([1], dtype=np.float64)) - - # timedelta, datetime - other = pd.to_datetime(['2000-01-01']).values - expected = pd.to_datetime(['2000-01-02']).values - tm.assert_numpy_array_equal(td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - 
tm.assert_numpy_array_equal(other + td, expected) - - expected = pd.to_datetime(['1999-12-31']).values - tm.assert_numpy_array_equal(-td + other, expected) - if LooseVersion(np.__version__) >= '1.8': - tm.assert_numpy_array_equal(other - td, expected) - - def test_ops_series(self): - # regression test for GH8813 - td = Timedelta('1 day') - other = pd.Series([1, 2]) - expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) - tm.assert_series_equal(expected, td * other) - tm.assert_series_equal(expected, other * td) - - def test_ops_series_object(self): - # GH 13043 - s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], - name='xxx') - assert s.dtype == object - - exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], - name='xxx') - tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) - tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) - - # object series & object series - s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), - pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], - name='xxx') - assert s2.dtype == object - exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')], - name='xxx') - tm.assert_series_equal(s2 - s, exp) - tm.assert_series_equal(s - s2, -exp) - - s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], - name='xxx', dtype=object) - assert s.dtype == object - - exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], - name='xxx') - tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp) - tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) - - def test_ops_notimplemented(self): - class Other: - pass - - other = Other() - - td = Timedelta('1 day') - assert td.__add__(other) is NotImplemented - assert td.__sub__(other) is NotImplemented - assert td.__truediv__(other) is NotImplemented - assert td.__mul__(other) is NotImplemented - assert td.__floordiv__(other) is NotImplemented - def test_ops_error_str(self): # GH 13624 tdi = TimedeltaIndex(['1 day', '2 days']) @@ -1073,126 +651,6 @@ def test_timedelta_ops(self): Timestamp('2015-02-15')]) assert s.diff().median() == timedelta(days=6) - def test_timedelta_ops_scalar(self): - # GH 6808 - base = pd.to_datetime('20130101 09:01:12.123456') - expected_add = pd.to_datetime('20130101 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - - base = pd.to_datetime('20130102 09:01:12.123456') - expected_add = pd.to_datetime('20130103 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta('1 day, 00:00:10'), - pd.to_timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - - def test_timedelta_ops_with_missing_values(self): - # setup - s1 = pd.to_timedelta(Series(['00:00:01'])) - s2 = pd.to_timedelta(Series(['00:00:02'])) - sn = pd.to_timedelta(Series([pd.NaT])) - df1 = DataFrame(['00:00:01']).apply(pd.to_timedelta) - df2 = DataFrame(['00:00:02']).apply(pd.to_timedelta) - dfn = 
DataFrame([pd.NaT]).apply(pd.to_timedelta) - scalar1 = pd.to_timedelta('00:00:01') - scalar2 = pd.to_timedelta('00:00:02') - timedelta_NaT = pd.to_timedelta('NaT') - NA = np.nan - - actual = scalar1 + scalar1 - assert actual == scalar2 - actual = scalar2 - scalar1 - assert actual == scalar1 - - actual = s1 + s1 - assert_series_equal(actual, s2) - actual = s2 - s1 - assert_series_equal(actual, s1) - - actual = s1 + scalar1 - assert_series_equal(actual, s2) - actual = scalar1 + s1 - assert_series_equal(actual, s2) - actual = s2 - scalar1 - assert_series_equal(actual, s1) - actual = -scalar1 + s2 - assert_series_equal(actual, s1) - - actual = s1 + timedelta_NaT - assert_series_equal(actual, sn) - actual = timedelta_NaT + s1 - assert_series_equal(actual, sn) - actual = s1 - timedelta_NaT - assert_series_equal(actual, sn) - actual = -timedelta_NaT + s1 - assert_series_equal(actual, sn) - - actual = s1 + NA - assert_series_equal(actual, sn) - actual = NA + s1 - assert_series_equal(actual, sn) - actual = s1 - NA - assert_series_equal(actual, sn) - actual = -NA + s1 - assert_series_equal(actual, sn) - - actual = s1 + pd.NaT - assert_series_equal(actual, sn) - actual = s2 - pd.NaT - assert_series_equal(actual, sn) - - actual = s1 + df1 - assert_frame_equal(actual, df2) - actual = s2 - df1 - assert_frame_equal(actual, df1) - actual = df1 + s1 - assert_frame_equal(actual, df2) - actual = df2 - s1 - assert_frame_equal(actual, df1) - - actual = df1 + df1 - assert_frame_equal(actual, df2) - actual = df2 - df1 - assert_frame_equal(actual, df1) - - actual = df1 + scalar1 - assert_frame_equal(actual, df2) - actual = df2 - scalar1 - assert_frame_equal(actual, df1) - - actual = df1 + timedelta_NaT - assert_frame_equal(actual, dfn) - actual = df1 - timedelta_NaT - assert_frame_equal(actual, dfn) - - actual = df1 + NA - assert_frame_equal(actual, dfn) - actual = df1 - NA - assert_frame_equal(actual, dfn) - - actual = df1 + pd.NaT # NaT is datetime, not timedelta - assert_frame_equal(actual, dfn) - actual = df1 - pd.NaT - assert_frame_equal(actual, dfn) - def test_compare_timedelta_series(self): # regresssion test for GH5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) @@ -1207,78 +665,3 @@ def test_compare_timedelta_ndarray(self): result = arr[0] > arr expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) - - -class TestSlicing(object): - - def test_tdi_ops_attributes(self): - rng = timedelta_range('2 days', periods=5, freq='2D', name='x') - - result = rng + 1 - exp = timedelta_range('4 days', periods=5, freq='2D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '2D' - - result = rng - 2 - exp = timedelta_range('-2 days', periods=5, freq='2D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '2D' - - result = rng * 2 - exp = timedelta_range('4 days', periods=5, freq='4D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '4D' - - result = rng / 2 - exp = timedelta_range('1 days', periods=5, freq='D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == 'D' - - result = -rng - exp = timedelta_range('-2 days', periods=5, freq='-2D', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '-2D' - - rng = pd.timedelta_range('-2 days', periods=5, freq='D', name='x') - - result = abs(rng) - exp = TimedeltaIndex(['2 days', '1 days', '0 days', '1 days', - '2 days'], name='x') - tm.assert_index_equal(result, exp) - assert result.freq is None - - def test_add_overflow(self): - # see 
gh-14068 - msg = "too (big|large) to convert" - with tm.assert_raises_regex(OverflowError, msg): - to_timedelta(106580, 'D') + Timestamp('2000') - with tm.assert_raises_regex(OverflowError, msg): - Timestamp('2000') + to_timedelta(106580, 'D') - - _NaT = int(pd.NaT) + 1 - msg = "Overflow in int64 addition" - with tm.assert_raises_regex(OverflowError, msg): - to_timedelta([106580], 'D') + Timestamp('2000') - with tm.assert_raises_regex(OverflowError, msg): - Timestamp('2000') + to_timedelta([106580], 'D') - with tm.assert_raises_regex(OverflowError, msg): - to_timedelta([_NaT]) - Timedelta('1 days') - with tm.assert_raises_regex(OverflowError, msg): - to_timedelta(['5 days', _NaT]) - Timedelta('1 days') - with tm.assert_raises_regex(OverflowError, msg): - (to_timedelta([_NaT, '5 days', '1 hours']) - - to_timedelta(['7 seconds', _NaT, '4 hours'])) - - # These should not overflow! - exp = TimedeltaIndex([pd.NaT]) - result = to_timedelta([pd.NaT]) - Timedelta('1 days') - tm.assert_index_equal(result, exp) - - exp = TimedeltaIndex(['4 days', pd.NaT]) - result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') - tm.assert_index_equal(result, exp) - - exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) - result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + - to_timedelta(['7 seconds', pd.NaT, '4 hours'])) - tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 0a09199eca9d5..2683110f2f02e 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -6,7 +6,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (timedelta_range, date_range, Series, Timedelta, - DatetimeIndex, TimedeltaIndex, Index, DataFrame, + TimedeltaIndex, Index, DataFrame, Int64Index) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_index_equal) @@ -27,6 +27,11 @@ def setup_method(self, method): def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + def test_numeric_compat(self): + # Dummy method to override super's version; this test is now done + # in test_arithmetic.py + pass + def test_shift(self): # test shift for TimedeltaIndex # err8083 @@ -105,81 +110,9 @@ def test_get_indexer(self): tolerance=pd.Timedelta('1 hour')) tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) - def test_numeric_compat(self): - - idx = self._holder(np.arange(5, dtype='int64')) - didx = self._holder(np.arange(5, dtype='int64') ** 2) - result = idx * 1 - tm.assert_index_equal(result, idx) - - result = 1 * idx - tm.assert_index_equal(result, idx) - - result = idx / 1 - tm.assert_index_equal(result, idx) - - result = idx // 1 - tm.assert_index_equal(result, idx) - - result = idx * np.array(5, dtype='int64') - tm.assert_index_equal(result, - self._holder(np.arange(5, dtype='int64') * 5)) - - result = idx * np.arange(5, dtype='int64') - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='int64')) - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='float64') + 0.1) - tm.assert_index_equal(result, self._holder(np.arange( - 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) - - # invalid - pytest.raises(TypeError, lambda: idx * idx) - pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) - pytest.raises(ValueError, lambda: idx * np.array([1, 2])) - def test_pickle_compat_construction(self): 
pass - def test_ufunc_coercions(self): - # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') - - for result in [idx * 2, np.multiply(idx, 2)]: - assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'], - freq='4H', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '4H' - - for result in [idx / 2, np.divide(idx, 2)]: - assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'], - freq='H', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == 'H' - - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') - for result in [-idx, np.negative(idx)]: - assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'], - freq='-2H', name='x') - tm.assert_index_equal(result, exp) - assert result.freq == '-2H' - - idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'], - freq='H', name='x') - for result in [abs(idx), np.absolute(idx)]: - assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'], - freq=None, name='x') - tm.assert_index_equal(result, exp) - assert result.freq is None - def test_fillna_timedelta(self): # GH 11343 idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day']) @@ -573,40 +506,6 @@ def test_freq_conversion(self): assert_index_equal(result, expected) -class TestSlicing(object): - @pytest.mark.parametrize('freq', ['B', 'D']) - def test_timedelta(self, freq): - index = date_range('1/1/2000', periods=50, freq=freq) - - shifted = index + timedelta(1) - back = shifted + timedelta(-1) - tm.assert_index_equal(index, back) - - if freq == 'D': - expected = pd.tseries.offsets.Day(1) - assert index.freq == expected - assert shifted.freq == expected - assert back.freq == expected - else: # freq == 'B' - assert index.freq == pd.tseries.offsets.BusinessDay(1) - assert shifted.freq is None - assert back.freq == pd.tseries.offsets.BusinessDay(1) - - result = index - timedelta(1) - expected = index + timedelta(-1) - tm.assert_index_equal(result, expected) - - # GH4134, buggy with timedeltas - rng = date_range('2013', '2014') - s = Series(rng) - result1 = rng - pd.offsets.Hour(1) - result2 = DatetimeIndex(s - np.timedelta64(100000000)) - result3 = rng - np.timedelta64(100000000) - result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - tm.assert_index_equal(result1, result4) - tm.assert_index_equal(result2, result3) - - class TestTimeSeries(object): _multiprocess_can_split_ = True diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 1a4d1b1d7abaa..b4ad28eeacb69 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -6,8 +6,7 @@ import pandas as pd import pandas.util.testing as tm from pandas.util.testing import assert_series_equal -from pandas import (Series, Timedelta, to_timedelta, isna, - TimedeltaIndex) +from pandas import Series, to_timedelta, isna, TimedeltaIndex from pandas._libs.tslib import iNaT @@ -174,28 +173,3 @@ def test_to_timedelta_on_missing_values(self): actual = pd.to_timedelta(pd.NaT) assert actual.value == timedelta_NaT.astype('int64') - - def test_to_timedelta_on_nanoseconds(self): - # GH 9273 - result = Timedelta(nanoseconds=100) - expected = Timedelta('100ns') - assert result == expected - - result = Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, - 
milliseconds=1, microseconds=1, nanoseconds=1) - expected = Timedelta(694861001001001) - assert result == expected - - result = Timedelta(microseconds=1) + Timedelta(nanoseconds=1) - expected = Timedelta('1us1ns') - assert result == expected - - result = Timedelta(microseconds=1) - Timedelta(nanoseconds=1) - expected = Timedelta('999ns') - assert result == expected - - result = Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2) - expected = Timedelta('990ns') - assert result == expected - - pytest.raises(TypeError, lambda: Timedelta(nanoseconds='abc')) diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index b5a8ce24fa4f8..d4434b3af385b 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -12,6 +12,35 @@ from pandas._libs.tslib import iNaT, NaT +class TestTimedeltaArithmetic(object): + _multiprocess_can_split_ = True + + def test_to_timedelta_on_nanoseconds(self): + # GH 9273 + result = Timedelta(nanoseconds=100) + expected = Timedelta('100ns') + assert result == expected + + result = Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, + milliseconds=1, microseconds=1, nanoseconds=1) + expected = Timedelta(694861001001001) + assert result == expected + + result = Timedelta(microseconds=1) + Timedelta(nanoseconds=1) + expected = Timedelta('1us1ns') + assert result == expected + + result = Timedelta(microseconds=1) - Timedelta(nanoseconds=1) + expected = Timedelta('999ns') + assert result == expected + + result = Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2) + expected = Timedelta('990ns') + assert result == expected + + pytest.raises(TypeError, lambda: Timedelta(nanoseconds='abc')) + + class TestTimedeltas(object): _multiprocess_can_split_ = True From 88bf0012adf471f6615934840cc20c8f7e09414f Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 1 Nov 2017 01:23:09 +0000 Subject: [PATCH 06/44] Adding skip to test failing because of lxml import (#17747) (#17748) --- pandas/tests/io/test_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 8dfae2733ef20..b029403435d6f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -955,6 +955,7 @@ def test_importcheck_thread_safety(): # see gh-16928 # force import check by reinitalising global vars in html.py + pytest.importorskip('lxml') reload(pandas.io.html) filename = os.path.join(DATA_PATH, 'valid_markup.html') From 7d8c9ab0f307bbeaf93a150bb7f5923411269c81 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Nov 2017 03:28:17 -0700 Subject: [PATCH 07/44] a zillion flakes (#18046) --- ci/lint.sh | 4 +- pandas/_libs/algos.pyx | 5 +- pandas/_libs/groupby.pyx | 2 +- pandas/_libs/hashing.pyx | 8 +- pandas/_libs/index.pyx | 7 +- pandas/_libs/interval.pyx | 1 + pandas/_libs/join.pyx | 2 +- pandas/_libs/lib.pyx | 23 ++-- pandas/_libs/parsers.pyx | 22 ++-- pandas/_libs/period.pyx | 34 +++--- pandas/_libs/sparse.pyx | 6 +- pandas/_libs/src/inference.pyx | 4 +- pandas/_libs/src/reduce.pyx | 6 +- pandas/_libs/tslib.pyx | 49 ++++---- pandas/_libs/tslibs/conversion.pyx | 11 +- pandas/_libs/tslibs/fields.pyx | 14 +-- pandas/_libs/tslibs/frequencies.pyx | 181 ++++++++++++++-------------- pandas/_libs/tslibs/parsing.pyx | 20 +-- pandas/_libs/tslibs/strptime.pyx | 28 +++-- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/_libs/tslibs/timezones.pyx | 7 +- pandas/_libs/window.pyx | 4 +- pandas/io/msgpack/_packer.pyx | 2 +- pandas/io/msgpack/_unpacker.pyx | 12 
+- pandas/tests/io/test_feather.py | 16 +-- pandas/tests/io/test_pytables.py | 51 ++++---- pandas/tseries/frequencies.py | 2 +- setup.cfg | 2 +- setup.py | 19 ++- 29 files changed, 288 insertions(+), 256 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 22f8628f59dcd..43d6ea0c118b0 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -24,7 +24,7 @@ if [ "$LINT" ]; then echo "Linting setup.py DONE" echo "Linting *.pyx" - flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126 + flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403 if [ $? -ne "0" ]; then RET=1 fi @@ -34,7 +34,7 @@ if [ "$LINT" ]; then for path in 'src' do echo "linting -> pandas/$path" - flake8 pandas/$path --filename=*.pxi.in --select=E501,E302,E203,E111,E114,E221,E303,E231,E126 + flake8 pandas/$path --filename=*.pxi.in --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 if [ $? -ne "0" ]; then RET=1 fi diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a44a7288bda45..e9ef9c4ffe24b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -258,7 +258,7 @@ def min_subseq(ndarray[double_t] arr): return (s, e, -m) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Pairwise correlation/covariance @@ -322,7 +322,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None): return result -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Pairwise Spearman correlation @@ -386,6 +386,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): return result + # generated from template include "algos_common_helper.pxi" include "algos_rank_helper.pxi" diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1cb7b18fa4f61..2fbbc81c4b5a1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -25,7 +25,7 @@ cdef double nan = NaN # TODO: aggregate multiple columns in single pass -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # first, nth, last diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 06ed947808e39..53203dd30daee 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -93,22 +93,26 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): free(lens) return result + cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil: return (x << b) | (x >> (64 - b)) + cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil: p[0] = (v) p[1] = (v >> 8) p[2] = (v >> 16) p[3] = (v >> 24) + cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil: u32to8_le(p, v) u32to8_le(p + 4, (v >> 32)) + cdef inline uint64_t u8to64_le(uint8_t* p) nogil: return (p[0] | - p[1] << 8 | + p[1] << 8 | p[2] << 16 | p[3] << 24 | p[4] << 32 | @@ -116,6 +120,7 @@ cdef inline uint64_t u8to64_le(uint8_t* p) nogil: p[6] << 48 | p[7] << 56) + cdef inline void _sipround(uint64_t* v0, uint64_t* v1, uint64_t* v2, uint64_t* v3) nogil: v0[0] += v1[0] @@ -133,6 +138,7 @@ cdef inline void _sipround(uint64_t* v0, uint64_t* v1, v1[0] ^= v2[0] v2[0] = _rotl(v2[0], 32) + cpdef uint64_t siphash(bytes data, bytes key) except? 
0: if len(key) != 16: raise ValueError( diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c96251a0293d6..e98c0131e9c44 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -122,7 +122,7 @@ cdef class IndexEngine: if not self.is_unique: return self._get_loc_duplicates(val) values = self._get_index_values() - loc = _bin_search(values, val) # .searchsorted(val, side='left') + loc = _bin_search(values, val) # .searchsorted(val, side='left') if loc >= len(values): raise KeyError(val) if util.get_value_at(values, loc) != val: @@ -475,15 +475,14 @@ cdef class DatetimeEngine(Int64Engine): if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') - return algos.pad_int64(self._get_index_values(), other, - limit=limit) + return algos.pad_int64(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other, limit=None): if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') return algos.backfill_int64(self._get_index_values(), other, - limit=limit) + limit=limit) cdef class TimedeltaEngine(DatetimeEngine): diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 264a983fe4d53..c09642511207a 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -13,6 +13,7 @@ from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, import numbers _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) + cdef class IntervalMixin: property closed_left: def __get__(self): diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 33c3650fa0425..8dbc70a0bdbe9 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -147,7 +147,7 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 ndarray[int64_t] left_count, right_count, left_sorter, right_sorter diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b4687df8785dd..f882c3d7a7621 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -82,6 +82,7 @@ def values_from_object(object o): return o + cpdef map_indices_list(list index): """ Produce a dict mapping the values of the input array to their respective @@ -116,7 +117,8 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): s += arr[i].__sizeof__() return s -#---------------------------------------------------------------------- + +# ---------------------------------------------------------------------- # isnull / notnull related cdef double INF = np.inf @@ -125,7 +127,7 @@ cdef double NEGINF = -INF cpdef bint checknull(object val): if util.is_float_object(val) or util.is_complex_object(val): - return val != val # and val != INF and val != NEGINF + return val != val # and val != INF and val != NEGINF elif util.is_datetime64_object(val): return get_datetime64_value(val) == NPY_NAT elif val is NaT: @@ -990,7 +992,7 @@ def convert_json_to_lines(object arr): in_quotes = ~in_quotes if v == backslash or is_escaping: is_escaping = ~is_escaping - if v == comma: # commas that should be \n + if v == comma: # commas that should be \n if num_open_brackets_seen == 0 and not in_quotes: narr[i] = newline elif v == left_bracket: @@ -1015,7 +1017,7 @@ def write_csv_rows(list data, ndarray data_index, # In crude testing, N>100 yields little marginal improvement N=100 - # pre-allocate rows + # 
pre-allocate rows ncols = len(cols) rows = [[None] * (nlevels + ncols) for x in range(N)] @@ -1047,12 +1049,13 @@ def write_csv_rows(list data, ndarray data_index, if j >= N - 1 and j % N == N - 1: writer.writerows(rows) - if j >= 0 and (j < N - 1 or (j % N) != N - 1): + if j >= 0 and (j < N - 1 or (j % N) != N - 1): writer.writerows(rows[:((j + 1) % N)]) -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Groupby-related functions + @cython.boundscheck(False) def arrmap(ndarray[object] index, object func): cdef int length = index.shape[0] @@ -1136,7 +1139,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, bins = np.empty(lenbin - 1, dtype=np.int64) j = 0 # index into values - bc = 0 # bin count + bc = 0 # bin count # linear scan if right_closed: @@ -1285,9 +1288,9 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, cdef class _PandasNull: def __richcmp__(_PandasNull self, object other, int op): - if op == 2: # == + if op == 2: # == return isinstance(other, _PandasNull) - elif op == 3: # != + elif op == 3: # != return not isinstance(other, _PandasNull) else: return False @@ -1793,7 +1796,7 @@ cdef class BlockPlacement: stop += other_int if ((step > 0 and start < 0) or - (step < 0 and stop < step)): + (step < 0 and stop < step)): raise ValueError("iadd causes length change") if stop < 0: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 0dacdf70a71d5..a5ce6c560d844 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -138,7 +138,7 @@ cdef extern from "parser/tokenizer.h": # Store words in (potentially ragged) matrix for now, hmm char **words - int64_t *word_starts # where we are in the stream + int64_t *word_starts # where we are in the stream int64_t words_len int64_t words_cap @@ -400,7 +400,7 @@ cdef class TextReader: raise ValueError('only length-1 separators excluded right now') self.parser.delimiter = ord(delimiter) - #---------------------------------------- + # ---------------------------------------- # parser options self.parser.doublequote = doublequote @@ -519,7 +519,7 @@ cdef class TextReader: self.index_col = index_col - #---------------------------------------- + # ---------------------------------------- # header stuff self.allow_leading_cols = allow_leading_cols @@ -810,7 +810,7 @@ cdef class TextReader: if hr == self.header[-1]: lc = len(this_header) ic = (len(self.index_col) if self.index_col - is not None else 0) + is not None else 0) if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -848,7 +848,7 @@ cdef class TextReader: # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: field_count = len(header[0]) - else: # not self.has_usecols: + else: # not self.has_usecols: field_count = self.parser.line_fields[data_line] @@ -1374,6 +1374,7 @@ def _ensure_encoded(list lst): result.append(x) return result + cdef asbytes(object o): if PY3: return str(o).encode('utf-8') @@ -1417,11 +1418,13 @@ def _maybe_upcast(arr): return arr + cdef enum StringPath: CSTRING UTF8 ENCODED + # factored out logic to pick string converter cdef inline StringPath _string_path(char *encoding): if encoding != NULL and encoding != b"utf-8": @@ -1430,9 +1433,12 @@ cdef inline StringPath _string_path(char *encoding): return UTF8 else: return CSTRING + + # ---------------------------------------------------------------------- # Type conversions / inference 
support code + cdef _string_box_factorize(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): @@ -1782,7 +1788,7 @@ cdef inline int _try_double_nogil(parser_t *parser, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF @@ -1803,7 +1809,7 @@ cdef inline int _try_double_nogil(parser_t *parser, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF @@ -2263,6 +2269,7 @@ def _compute_na_values(): } return na_values + na_values = _compute_na_values() for k in list(na_values): @@ -2362,6 +2369,7 @@ def _to_structured_array(dict columns, object names, object usecols): return recs + cdef _fill_structured_column(char *dst, char* src, int64_t elsize, int64_t stride, int64_t length, bint incref): cdef: diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 4b8c86ae9d4b2..0456033dbb731 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -109,8 +109,8 @@ cdef extern from "period_helper.h": initialize_daytime_conversion_factor_matrix() +# ---------------------------------------------------------------------- # Period logic -#---------------------------------------------------------------------- @cython.wraparound(False) @@ -168,9 +168,11 @@ def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): return out + cdef char START = 'S' cdef char END = 'E' + cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, bint end): """ @@ -278,31 +280,31 @@ def period_format(int64_t value, int freq, object fmt=None): if fmt is None: freq_group = (freq // 1000) * 1000 - if freq_group == 1000: # FR_ANN + if freq_group == 1000: # FR_ANN fmt = b'%Y' - elif freq_group == 2000: # FR_QTR + elif freq_group == 2000: # FR_QTR fmt = b'%FQ%q' - elif freq_group == 3000: # FR_MTH + elif freq_group == 3000: # FR_MTH fmt = b'%Y-%m' - elif freq_group == 4000: # WK + elif freq_group == 4000: # WK left = period_asfreq(value, freq, 6000, 0) right = period_asfreq(value, freq, 6000, 1) return '%s/%s' % (period_format(left, 6000), period_format(right, 6000)) - elif (freq_group == 5000 # BUS - or freq_group == 6000): # DAY + elif (freq_group == 5000 # BUS + or freq_group == 6000): # DAY fmt = b'%Y-%m-%d' - elif freq_group == 7000: # HR + elif freq_group == 7000: # HR fmt = b'%Y-%m-%d %H:00' - elif freq_group == 8000: # MIN + elif freq_group == 8000: # MIN fmt = b'%Y-%m-%d %H:%M' - elif freq_group == 9000: # SEC + elif freq_group == 9000: # SEC fmt = b'%Y-%m-%d %H:%M:%S' - elif freq_group == 10000: # MILLISEC + elif freq_group == 10000: # MILLISEC fmt = b'%Y-%m-%d %H:%M:%S.%l' - elif freq_group == 11000: # MICROSEC + elif freq_group == 11000: # MICROSEC fmt = b'%Y-%m-%d %H:%M:%S.%u' - elif freq_group == 12000: # NANOSEC + elif freq_group == 12000: # NANOSEC fmt = b'%Y-%m-%d %H:%M:%S.%n' else: raise ValueError('Unknown freq: %d' % freq) @@ -730,7 +732,7 @@ cdef class _Period(object): return Period(ordinal=ordinal, freq=self.freq) msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - else: # pragma no cover + else: # pragma no cover return NotImplemented def __add__(self, other): @@ -1148,8 +1150,8 @@ class 
Period(_Period): elif value is None: if (year is None and month is None and - quarter is None and day is None and - hour is None and minute is None and second is None): + quarter is None and day is None and + hour is None and minute is None and second is None): ordinal = iNaT else: if freq is None: diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index fac678e531c8b..5484cbda5bdf9 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -328,7 +328,7 @@ cdef class BlockIndex(SparseIndex): ndarray blocs, blengths cdef: - object __weakref__ # need to be picklable + object __weakref__ # need to be picklable int32_t *locbuf int32_t *lenbuf @@ -486,7 +486,7 @@ cdef class BlockIndex(SparseIndex): cur_length = xlen[xi] xi += 1 - else: # xloc[xi] < yloc[yi] + else: # xloc[xi] < yloc[yi] cur_loc = yloc[yi] diff = yloc[yi] - xloc[xi] @@ -629,7 +629,7 @@ cdef class BlockMerge(object): cdef: BlockIndex x, y, result ndarray xstart, xlen, xend, ystart, ylen, yend - int32_t xi, yi # block indices + int32_t xi, yi # block indices def __init__(self, BlockIndex x, BlockIndex y): self.x = x diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 8fab825eae428..5d550148b10bc 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -448,8 +448,8 @@ def infer_dtype(object value, bint skipna=False): for i in range(n): val = util.get_value_1d(values, i) if (util.is_integer_object(val) and - not util.is_timedelta64_object(val) and - not util.is_datetime64_object(val)): + not util.is_timedelta64_object(val) and + not util.is_datetime64_object(val)): return 'mixed-integer' return 'mixed' diff --git a/pandas/_libs/src/reduce.pyx b/pandas/_libs/src/reduce.pyx index f578eb2f4a346..d1761384114ef 100644 --- a/pandas/_libs/src/reduce.pyx +++ b/pandas/_libs/src/reduce.pyx @@ -1,4 +1,5 @@ -#cython=False +# -*- coding: utf-8 -*- +# cython: profile=False import numpy as np from distutils.version import LooseVersion @@ -512,7 +513,7 @@ def apply_frame_axis0(object frame, object f, object names, for i in range(n): slider.move(starts[i], ends[i]) - item_cache.clear() # ugh + item_cache.clear() # ugh object.__setattr__(slider.dummy, 'name', names[i]) piece = f(slider.dummy) @@ -532,6 +533,7 @@ def apply_frame_axis0(object frame, object f, object names, return results, mutated + cdef class BlockSlider: """ Only capable of sliding on axis=0 diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 025533b29366f..d2492064c900c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -80,7 +80,7 @@ UTC = pytz.utc # initialize numpy import_array() -# import_ufunc() + cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT @@ -421,7 +421,7 @@ class Timestamp(_Timestamp): def _round(self, freq, rounder): cdef: - int64_t unit, r, value, buff = 1000000 + int64_t unit, r, value, buff = 1000000 object result from pandas.tseries.frequencies import to_offset @@ -620,7 +620,7 @@ class Timestamp(_Timestamp): # tz naive, localize tz = maybe_get_tz(tz) if not is_string_object(ambiguous): - ambiguous = [ambiguous] + ambiguous = [ambiguous] value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, ambiguous=ambiguous, errors=errors)[0] return Timestamp(value, tz=tz) @@ -809,6 +809,7 @@ class Timestamp(_Timestamp): # ---------------------------------------------------------------------- + cdef inline bint _check_all_nulls(object val): """ utility to check if a value is any type of null """ cdef bint res @@ -1040,7 +1041,7 @@ cdef class 
_Timestamp(datetime): if self.tzinfo is None: if other.tzinfo is not None: raise TypeError('Cannot compare tz-naive and tz-aware ' - 'timestamps') + 'timestamps') elif other.tzinfo is None: raise TypeError('Cannot compare tz-naive and tz-aware timestamps') @@ -1210,10 +1211,10 @@ cdef class _Timestamp(datetime): # format a Timestamp with only _date_repr if possible # otherwise _repr_base if (self.hour == 0 and - self.minute == 0 and - self.second == 0 and - self.microsecond == 0 and - self.nanosecond == 0): + self.minute == 0 and + self.second == 0 and + self.microsecond == 0 and + self.nanosecond == 0): return self._date_repr return self._repr_base @@ -1332,8 +1333,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, # sort of a temporary hack if ts.tzinfo is not None: - if (hasattr(tz, 'normalize') and - hasattr(ts.tzinfo, '_utcoffset')): + if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'): ts = tz.normalize(ts) obj.value = pydatetime_to_dt64(ts, &obj.dts) obj.tzinfo = ts.tzinfo @@ -1682,7 +1682,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): if not need_to_iterate: if ((fvalues < _NS_LOWER_BOUND).any() - or (fvalues > _NS_UPPER_BOUND).any()): + or (fvalues > _NS_UPPER_BOUND).any()): raise OutOfBoundsDatetime( "cannot convert input with unit '{0}'".format(unit)) result = (iresult *m).astype('M8[ns]') @@ -1950,7 +1950,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', raise TypeError("{0} is not convertible to datetime" .format(type(val))) - if seen_datetime and seen_integer: + if seen_datetime and seen_integer: # we have mixed datetimes & integers if is_coerce: @@ -2027,9 +2027,9 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', cdef class _Timedelta(timedelta): cdef readonly: - int64_t value # nanoseconds - object freq # frequency reference - bint is_populated # are my components populated + int64_t value # nanoseconds + object freq # frequency reference + bint is_populated # are my components populated int64_t _sign, _d, _h, _m, _s, _ms, _us, _ns def __hash__(_Timedelta self): @@ -2190,20 +2190,20 @@ class Timedelta(_Timedelta): if value is _no_input: if not len(kwargs): - raise ValueError( - "cannot construct a Timedelta without a value/unit or " - "descriptive keywords (days,seconds....)") + raise ValueError("cannot construct a Timedelta without a " + "value/unit or descriptive keywords " + "(days,seconds....)") def _to_py_int_float(v): if is_integer_object(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError( - "Invalid type {0}. Must be int or float.".format(type(v))) + raise TypeError("Invalid type {0}. 
Must be int or " + "float.".format(type(v))) kwargs = dict([(k, _to_py_int_float(v)) - for k, v in iteritems(kwargs)]) + for k, v in iteritems(kwargs)]) try: nano = kwargs.pop('nanoseconds', 0) @@ -2233,9 +2233,8 @@ class Timedelta(_Timedelta): elif _checknull_with_nat(value): return NaT else: - raise ValueError( - "Value must be Timedelta, string, integer, " - "float, timedelta or convertible") + raise ValueError("Value must be Timedelta, string, integer, " + "float, timedelta or convertible") if is_timedelta64_object(value): value = value.view('i8') @@ -2389,6 +2388,7 @@ class Timedelta(_Timedelta): def __repr__(self): return "Timedelta('{0}')".format(self._repr_base(format='long')) + def __str__(self): return self._repr_base(format='long') @@ -2674,6 +2674,7 @@ class Timedelta(_Timedelta): __pos__ = _op_unary_method(lambda x: x, '__pos__') __abs__ = _op_unary_method(lambda x: abs(x), '__abs__') + # resolution in ns Timedelta.min = Timedelta(np.iinfo(np.int64).min +1) Timedelta.max = Timedelta(np.iinfo(np.int64).max) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index fe729594526ef..478d3bba80b00 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -206,8 +206,8 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): dt64_to_dtstruct(v, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz1) - delta = (int(get_utcoffset(tz1, dt).total_seconds()) - * 1000000000) + delta = (int(get_utcoffset(tz1, dt).total_seconds()) * + 1000000000) utc_dates[i] = v - delta else: trans, deltas, typ = get_dst_info(tz1) @@ -246,8 +246,8 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): dt64_to_dtstruct(v, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = (int(get_utcoffset(tz2, dt).total_seconds()) - * 1000000000) + delta = (int(get_utcoffset(tz2, dt).total_seconds()) * + 1000000000) result[i] = v + delta return result @@ -414,7 +414,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, raise pytz.AmbiguousTimeError( "There are %i dst switches when " "there should only be 1." 
% switch_idx.size) - switch_idx = switch_idx[0] + 1 # Pull the only index and adjust + switch_idx = switch_idx[0] + 1 + # Pull the only index and adjust a_idx = grp[:switch_idx] b_idx = grp[switch_idx:] dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 53ed8ddf22f4b..b40646295cce5 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -40,13 +40,13 @@ def build_field_sarray(ndarray[int64_t] dtindex): count = len(dtindex) - sa_dtype = [('Y', 'i4'), # year - ('M', 'i4'), # month - ('D', 'i4'), # day - ('h', 'i4'), # hour - ('m', 'i4'), # min - ('s', 'i4'), # second - ('u', 'i4')] # microsecond + sa_dtype = [('Y', 'i4'), # year + ('M', 'i4'), # month + ('D', 'i4'), # day + ('h', 'i4'), # hour + ('m', 'i4'), # min + ('s', 'i4'), # second + ('u', 'i4')] # microsecond out = np.empty(count, dtype=sa_dtype) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index f7889d76abbc7..9d810bfb411af 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -10,89 +10,15 @@ np.import_array() from util cimport is_integer_object - -cpdef get_freq_code(freqstr): - """ - Return freq str or tuple to freq code and stride (mult) - - Parameters - ---------- - freqstr : str or tuple - - Returns - ------- - return : tuple of base frequency code and stride (mult) - - Example - ------- - >>> get_freq_code('3D') - (6000, 3) - - >>> get_freq_code('D') - (6000, 1) - - >>> get_freq_code(('D', 3)) - (6000, 3) - """ - if getattr(freqstr, '_typ', None) == 'dateoffset': - freqstr = (freqstr.rule_code, freqstr.n) - - if isinstance(freqstr, tuple): - if (is_integer_object(freqstr[0]) and - is_integer_object(freqstr[1])): - # e.g., freqstr = (2000, 1) - return freqstr - else: - # e.g., freqstr = ('T', 5) - try: - code = _period_str_to_code(freqstr[0]) - stride = freqstr[1] - except: - if is_integer_object(freqstr[1]): - raise - code = _period_str_to_code(freqstr[1]) - stride = freqstr[0] - return code, stride - - if is_integer_object(freqstr): - return (freqstr, 1) - - base, stride = _base_and_stride(freqstr) - code = _period_str_to_code(base) - - return code, stride - +# ---------------------------------------------------------------------- +# Constants # hack to handle WOM-1MON opattern = re.compile( r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' ) - -cpdef _base_and_stride(freqstr): - """ - Return base freq and stride info from string representation - - Examples - -------- - _freq_and_stride('5Min') -> 'Min', 5 - """ - groups = opattern.match(freqstr) - - if not groups: - raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) - - stride = groups.group(1) - - if len(stride): - stride = int(stride) - else: - stride = 1 - - base = groups.group(2) - - return (base, stride) - +_INVALID_FREQ_ERROR = "Invalid frequency: {0}" # --------------------------------------------------------------------- # Period codes @@ -147,8 +73,8 @@ _period_code_map = { "S": 9000, # Secondly "L": 10000, # Millisecondly "U": 11000, # Microsecondly - "N": 12000, # Nanosecondly -} + "N": 12000} # Nanosecondly + _reverse_period_code_map = { _period_code_map[key]: key for key in _period_code_map} @@ -159,23 +85,20 @@ _period_code_map.update({'Y' + key[1:]: _period_code_map[key] if key.startswith('A-')}) _period_code_map.update({ - "Q": 2000, # Quarterly - December year end (default quarterly) - "A": 1000, # Annual - "W": 4000, # Weekly - 
"C": 5000, # Custom Business Day - }) - -_dont_uppercase = set(('MS', 'ms')) + "Q": 2000, # Quarterly - December year end (default quarterly) + "A": 1000, # Annual + "W": 4000, # Weekly + "C": 5000}) # Custom Business Day _lite_rule_alias = { 'W': 'W-SUN', 'Q': 'Q-DEC', - 'A': 'A-DEC', # YearEnd(month=12), + 'A': 'A-DEC', # YearEnd(month=12), 'Y': 'A-DEC', - 'AS': 'AS-JAN', # YearBegin(month=1), + 'AS': 'AS-JAN', # YearBegin(month=1), 'YS': 'AS-JAN', - 'BA': 'BA-DEC', # BYearEnd(month=12), + 'BA': 'BA-DEC', # BYearEnd(month=12), 'BY': 'BA-DEC', 'BAS': 'BAS-JAN', # BYearBegin(month=1), 'BYS': 'BAS-JAN', @@ -186,7 +109,85 @@ _lite_rule_alias = { 'us': 'U', 'ns': 'N'} -_INVALID_FREQ_ERROR = "Invalid frequency: {0}" +_dont_uppercase = set(('MS', 'ms')) + +# ---------------------------------------------------------------------- + +cpdef get_freq_code(freqstr): + """ + Return freq str or tuple to freq code and stride (mult) + + Parameters + ---------- + freqstr : str or tuple + + Returns + ------- + return : tuple of base frequency code and stride (mult) + + Example + ------- + >>> get_freq_code('3D') + (6000, 3) + + >>> get_freq_code('D') + (6000, 1) + + >>> get_freq_code(('D', 3)) + (6000, 3) + """ + if getattr(freqstr, '_typ', None) == 'dateoffset': + freqstr = (freqstr.rule_code, freqstr.n) + + if isinstance(freqstr, tuple): + if (is_integer_object(freqstr[0]) and + is_integer_object(freqstr[1])): + # e.g., freqstr = (2000, 1) + return freqstr + else: + # e.g., freqstr = ('T', 5) + try: + code = _period_str_to_code(freqstr[0]) + stride = freqstr[1] + except: + if is_integer_object(freqstr[1]): + raise + code = _period_str_to_code(freqstr[1]) + stride = freqstr[0] + return code, stride + + if is_integer_object(freqstr): + return (freqstr, 1) + + base, stride = _base_and_stride(freqstr) + code = _period_str_to_code(base) + + return code, stride + + +cpdef _base_and_stride(freqstr): + """ + Return base freq and stride info from string representation + + Examples + -------- + _freq_and_stride('5Min') -> 'Min', 5 + """ + groups = opattern.match(freqstr) + + if not groups: + raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) + + stride = groups.group(1) + + if len(stride): + stride = int(stride) + else: + stride = 1 + + base = groups.group(2) + + return (base, stride) cpdef _period_str_to_code(freqstr): diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 845d1b8dcabba..90882eefd9f67 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -44,10 +44,14 @@ from dateutil.relativedelta import relativedelta from dateutil.parser import DEFAULTPARSER from dateutil.parser import parse as du_parse +# ---------------------------------------------------------------------- +# Constants + class DateParseError(ValueError): pass + _nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, @@ -64,6 +68,8 @@ cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) NAT_SENTINEL = object() # This allows us to reference NaT without having to import it +# ---------------------------------------------------------------------- + def parse_datetime_string(date_string, freq=None, dayfirst=False, yearfirst=False, **kwargs): @@ -199,7 +205,7 @@ cpdef bint _does_string_look_like_datetime(object date_string): cdef inline object _parse_dateabbr_string(object date_string, object default, - object freq): + object freq): cdef: object ret int year, quarter = -1, 
month, mnum, date_len @@ -317,7 +323,7 @@ def dateutil_parse(object timestr, object default, ignoretz=False, res = DEFAULTPARSER._parse(fobj, **kwargs) # dateutil 2.2 compat - if isinstance(res, tuple): # PyTuple_Check + if isinstance(res, tuple): # PyTuple_Check res, _ = res if res is None: @@ -390,7 +396,7 @@ cpdef object _get_rule_month(object source, object default='DEC'): return source.split('-')[1] -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Parsing for type-inference @@ -404,7 +410,7 @@ def try_parse_dates(ndarray[object] values, parser=None, result = np.empty(n, dtype='O') if parser is None: - if default is None: # GH2618 + if default is None: # GH2618 date = datetime.now() default = datetime(date.year, date.month, 1) @@ -449,7 +455,7 @@ def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, result = np.empty(n, dtype='O') if date_parser is None: - if default is None: # GH2618 + if default is None: # GH2618 date = datetime.now() default = datetime(date.year, date.month, 1) @@ -506,7 +512,7 @@ def try_parse_datetime_components(ndarray[object] years, n = len(years) if (len(months) != n or len(days) != n or len(hours) != n or - len(minutes) != n or len(seconds) != n): + len(minutes) != n or len(seconds) != n): raise ValueError('Length of all datetime components must be equal') result = np.empty(n, dtype='O') @@ -525,7 +531,7 @@ def try_parse_datetime_components(ndarray[object] years, return result -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Miscellaneous _DATEUTIL_LEXER_SPLIT = None diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index a38aa37674e9e..214d7c0f2b432 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -69,6 +69,7 @@ def array_strptime(ndarray[object] values, object fmt, bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' + int ordinal assert is_raise or is_ignore or is_coerce @@ -102,7 +103,7 @@ def array_strptime(ndarray[object] values, object fmt, bad_directive = "%" del err raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) + (bad_directive, fmt)) # IndexError only occurs when the format string is "%" except IndexError: raise ValueError("stray %% in format '%s'" % fmt) @@ -163,7 +164,7 @@ def array_strptime(ndarray[object] values, object fmt, iresult[i] = NPY_NAT continue raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) + values[i][found.end():]) # search else: @@ -198,8 +199,8 @@ def array_strptime(ndarray[object] values, object fmt, if parse_code == 0: year = int(found_dict['y']) # Open Group specification for strptime() states that a %y - #value in the range of [00, 68] is in the century 2000, while - #[69,99] is in the century 1900 + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 if year <= 68: year += 2000 else: @@ -296,9 +297,10 @@ def array_strptime(ndarray[object] values, object fmt, if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. 
- julian = datetime_date(year, month, day).toordinal() - \ - datetime_date(year, 1, 1).toordinal() + 1 - else: # Assume that if they bothered to include Julian day it will + ordinal = datetime_date(year, month, day).toordinal() + julian = ordinal - datetime_date(year, 1, 1).toordinal() + 1 + else: + # Assume that if they bothered to include Julian day it will # be accurate. datetime_result = datetime_date.fromordinal( (julian - 1) + datetime_date(year, 1, 1).toordinal()) @@ -454,8 +456,8 @@ class LocaleTime(object): date_time[1] = time.strftime("%x", time_tuple).lower() date_time[2] = time.strftime("%X", time_tuple).lower() replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], - '%B'), (self.a_weekday[2], '%a'), + (self.f_month[3], '%B'), + (self.a_weekday[2], '%a'), (self.a_month[3], '%b'), (self.am_pm[1], '%p'), ('1999', '%Y'), ('99', '%y'), ('22', '%H'), ('44', '%M'), ('55', '%S'), ('76', '%j'), @@ -463,7 +465,7 @@ class LocaleTime(object): # '3' needed for when no leading zero. ('2', '%w'), ('10', '%I')] replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone - for tz in tz_values]) + for tz in tz_values]) for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): current_format = date_time[offset] for old, new in replacement_pairs: @@ -536,7 +538,7 @@ class TimeRE(dict): 'w': r"(?P[0-6])", # W is set below by using 'U' 'y': r"(?P\d\d)", - #XXX: Does 'Y' need to worry about having less or more than + # XXX: Does 'Y' need to worry about having less or more than # 4 digits? 'Y': r"(?P\d\d\d\d)", 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), @@ -604,7 +606,7 @@ _cache_lock = _thread_allocate_lock() # DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock # first! _TimeRE_cache = TimeRE() -_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache +_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} @@ -615,7 +617,7 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, assumes the week starts on Sunday or Monday (6 or 0).""" cdef: - int first_weekday, week_0_length, days_to_week + int first_weekday, week_0_length, days_to_week first_weekday = datetime_date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index da1163e25f5c6..10c379ad43a63 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -222,7 +222,7 @@ cdef inline parse_timedelta_string(object ts): elif have_dot: if ((len(number) or len(frac)) and not len(unit) - and current_unit is None): + and current_unit is None): raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7f778dde86e23..7fb48e7c66f47 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -156,9 +156,10 @@ cdef inline object tz_cache_key(object tz): return None -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # UTC Offsets + cpdef get_utcoffset(tzinfo, obj): try: return tzinfo._utcoffset @@ -174,7 +175,7 @@ cdef inline bint is_fixed_offset(object tz): return 0 elif treat_tz_as_pytz(tz): if (len(tz._transition_info) == 0 - and len(tz._utc_transition_times) == 0): + and len(tz._utc_transition_times) == 0): return 1 else: return 0 @@ -246,7 +247,7 @@ 
cdef object get_dst_info(object tz): # get utc trans times trans_list = get_utc_trans_times_from_dateutil_tz(tz) trans = np.hstack([ - np.array([0], dtype='M8[s]'), # place holder for first item + np.array([0], dtype='M8[s]'), # place holder for 1st item np.array(trans_list, dtype='M8[s]')]).astype( 'M8[ns]') # all trans listed trans = trans.view('i8') diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index b6bd6f92f6199..a95e50785c9b0 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1381,8 +1381,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, else: vlow = skiplist.get(idx) vhigh = skiplist.get(idx + 1) - output[i] = (vlow + (vhigh - vlow) * - (quantile * (nobs - 1) - idx)) + output[i] = ((vlow + (vhigh - vlow) * + (quantile * (nobs - 1) - idx))) else: output[i] = NaN diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index fd3f4612fb432..f6383b42d4975 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -1,5 +1,5 @@ # coding: utf-8 -#cython: embedsignature=True +# cython: embedsignature=True from cpython cimport * from libc.stdlib cimport * diff --git a/pandas/io/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx index 22401d7514f65..05dfaad8b2058 100644 --- a/pandas/io/msgpack/_unpacker.pyx +++ b/pandas/io/msgpack/_unpacker.pyx @@ -1,5 +1,5 @@ # coding: utf-8 -#cython: embedsignature=True +# cython: embedsignature=True from cpython cimport * cdef extern from "Python.h": @@ -20,7 +20,7 @@ cdef extern from "../../src/msgpack/unpack.h": ctypedef struct msgpack_user: bint use_list PyObject* object_hook - bint has_pairs_hook # call object_hook with k-v pairs + bint has_pairs_hook # call object_hook with k-v pairs PyObject* list_hook PyObject* ext_hook char *encoding @@ -100,7 +100,7 @@ def default_read_extended_type(typecode, data): def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=1, encoding=None, unicode_errors="strict", object_pairs_hook=None, ext_hook=ExtType, - Py_ssize_t max_str_len=2147483647, # 2**32-1 + Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_array_len=2147483647, Py_ssize_t max_map_len=2147483647, @@ -257,7 +257,7 @@ cdef class Unpacker(object): object object_hook=None, object object_pairs_hook=None, object list_hook=None, encoding=None, unicode_errors='strict', int max_buffer_size=0, object ext_hook=ExtType, - Py_ssize_t max_str_len=2147483647, # 2**32-1 + Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_array_len=2147483647, Py_ssize_t max_map_len=2147483647, @@ -467,8 +467,8 @@ cdef class Unpacker(object): return self._unpack(unpack_construct, None, 1) # for debug. 
- #def _buf(self): + # def _buf(self): # return PyString_FromStringAndSize(self.buf, self.buf_tail) - #def _off(self): + # def _off(self): # return self.buf_head diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index dadfe7ca87e48..021f3715d472b 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,17 +1,17 @@ """ test feather-format compat """ - -import pytest -feather = pytest.importorskip('feather') +from distutils.version import LooseVersion import numpy as np -import pandas as pd -from pandas.io.feather_format import to_feather, read_feather -from feather import FeatherError -from pandas.util.testing import assert_frame_equal, ensure_clean +import pandas as pd import pandas.util.testing as tm -from distutils.version import LooseVersion +from pandas.util.testing import assert_frame_equal, ensure_clean + +import pytest +feather = pytest.importorskip('feather') +from feather import FeatherError # noqa:E402 +from pandas.io.feather_format import to_feather, read_feather # noqa:E402 fv = LooseVersion(feather.__version__) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index a97747b93369f..13bf81889af1a 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -3,39 +3,38 @@ import tempfile from contextlib import contextmanager from warnings import catch_warnings +from distutils.version import LooseVersion import datetime from datetime import timedelta + import numpy as np -import pandas import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, - isna) + isna, compat, concat, Timestamp) + +import pandas.util.testing as tm +from pandas.util.testing import (assert_panel4d_equal, + assert_panel_equal, + assert_frame_equal, + assert_series_equal, + set_timezone) from pandas.compat import (is_platform_windows, is_platform_little_endian, - PY3, PY35, PY36, BytesIO, text_type) + PY3, PY35, PY36, BytesIO, text_type, + range, lrange, u) from pandas.io.formats.printing import pprint_thing from pandas.core.dtypes.common import is_categorical_dtype tables = pytest.importorskip('tables') -from pandas.io.pytables import TableIterator -from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf, +from pandas.io import pytables as pytables # noqa:E402 +from pandas.io.pytables import (TableIterator, # noqa:E402 + HDFStore, get_store, Term, read_hdf, PossibleDataLossError, ClosedFileError) -from pandas.io import pytables as pytables -import pandas.util.testing as tm -from pandas.util.testing import (assert_panel4d_equal, - assert_panel_equal, - assert_frame_equal, - assert_series_equal, - set_timezone) -from pandas import concat, Timestamp -from pandas import compat -from pandas.compat import range, lrange, u -from distutils.version import LooseVersion _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' else 'zlib') @@ -328,13 +327,13 @@ def test_api_default_format(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - pandas.set_option('io.hdf.default_format', 'fixed') + pd.set_option('io.hdf.default_format', 'fixed') _maybe_remove(store, 'df') store.put('df', df) assert not store.get_storer('df').is_table pytest.raises(ValueError, store.append, 'df2', df) - pandas.set_option('io.hdf.default_format', 'table') + pd.set_option('io.hdf.default_format', 'table') _maybe_remove(store, 'df') store.put('df', 
df) assert store.get_storer('df').is_table @@ -342,19 +341,19 @@ def test_api_default_format(self): store.append('df2', df) assert store.get_storer('df').is_table - pandas.set_option('io.hdf.default_format', None) + pd.set_option('io.hdf.default_format', None) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - pandas.set_option('io.hdf.default_format', 'fixed') + pd.set_option('io.hdf.default_format', 'fixed') df.to_hdf(path, 'df') with HDFStore(path) as store: assert not store.get_storer('df').is_table pytest.raises(ValueError, df.to_hdf, path, 'df2', append=True) - pandas.set_option('io.hdf.default_format', 'table') + pd.set_option('io.hdf.default_format', 'table') df.to_hdf(path, 'df3') with HDFStore(path) as store: assert store.get_storer('df3').is_table @@ -362,7 +361,7 @@ def test_api_default_format(self): with HDFStore(path) as store: assert store.get_storer('df4').is_table - pandas.set_option('io.hdf.default_format', None) + pd.set_option('io.hdf.default_format', None) def test_keys(self): @@ -1086,7 +1085,7 @@ def _try_decode(x, encoding='latin-1'): examples = [] for dtype in ['category', object]: for val in values: - examples.append(pandas.Series(val, dtype=dtype)) + examples.append(pd.Series(val, dtype=dtype)) def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): with ensure_clean_path(self.path) as store: @@ -1171,13 +1170,13 @@ def test_append_all_nans(self): tm.assert_frame_equal(store['df2'], df) # tests the option io.hdf.dropna_table - pandas.set_option('io.hdf.dropna_table', False) + pd.set_option('io.hdf.dropna_table', False) _maybe_remove(store, 'df3') store.append('df3', df[:10]) store.append('df3', df[10:]) tm.assert_frame_equal(store['df3'], df) - pandas.set_option('io.hdf.dropna_table', True) + pd.set_option('io.hdf.dropna_table', True) _maybe_remove(store, 'df4') store.append('df4', df[:10]) store.append('df4', df[10:]) @@ -2253,7 +2252,7 @@ def test_calendar_roundtrip_issue(self): weekmask_egypt = 'Sun Mon Tue Wed Thu' holidays = ['2012-05-01', datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')] - bday_egypt = pandas.offsets.CustomBusinessDay( + bday_egypt = pd.offsets.CustomBusinessDay( holidays=holidays, weekmask=weekmask_egypt) dt = datetime.datetime(2013, 4, 30) dts = date_range(dt, periods=5, freq=bday_egypt) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 763e6547ea2cb..be25a439f9075 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -312,7 +312,7 @@ def _get_freq_str(base, mult=1): # --------------------------------------------------------------------- # Offset names ("time rules") and related functions -from pandas._libs.tslibs.offsets import _offset_to_period_map +from pandas._libs.tslibs.offsets import _offset_to_period_map # noqa:E402 from pandas.tseries.offsets import (Nano, Micro, Milli, Second, # noqa Minute, Hour, Day, BDay, CDay, Week, MonthBegin, diff --git a/setup.cfg b/setup.cfg index 0123078523b6f..7a88ee8557dc7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,7 @@ tag_prefix = v parentdir_prefix = pandas- [flake8] -ignore = E731,E402 +ignore = E731,E402,W503 max-line-length = 79 [yapf] diff --git a/setup.py b/setup.py index ed58329d5fd8f..783ded906eba2 100755 --- a/setup.py +++ b/setup.py @@ -7,10 +7,16 @@ """ import os +from os.path import join as pjoin + import sys import shutil from distutils.version import LooseVersion +# versioning +import versioneer +cmdclass = versioneer.get_cmdclass() + def is_platform_windows(): return sys.platform == 
'win32' or sys.platform == 'cygwin' @@ -24,10 +30,6 @@ def is_platform_mac(): return sys.platform == 'darwin' -# versioning -import versioneer -cmdclass = versioneer.get_cmdclass() - min_cython_ver = '0.23' try: import Cython @@ -77,9 +79,9 @@ def is_platform_mac(): " use pip or easy_install." "\n $ pip install 'python-dateutil < 2' 'numpy'") -from distutils.extension import Extension -from distutils.command.build import build -from distutils.command.build_ext import build_ext as _build_ext +from distutils.extension import Extension # noqa:E402 +from distutils.command.build import build # noqa:E402 +from distutils.command.build_ext import build_ext as _build_ext # noqa:E402 try: if not _CYTHON_INSTALLED: @@ -105,9 +107,6 @@ def is_platform_mac(): 'pip install Tempita') -from os.path import join as pjoin - - _pxi_dep_template = { 'algos': ['_libs/algos_common_helper.pxi.in', '_libs/algos_take_helper.pxi.in', From 131068046e8ee3c594714dd45b99ed51d87a9eab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Nov 2017 06:33:05 -0400 Subject: [PATCH 08/44] TST: separate out grouping-type tests (#18057) --- pandas/tests/groupby/test_functional.py | 371 +++++++ pandas/tests/groupby/test_groupby.py | 1133 +-------------------- pandas/tests/groupby/test_grouping.py | 732 +++++++++++++ pandas/tests/groupby/test_nth.py | 78 ++ pandas/tests/groupby/test_transform.py | 100 +- pandas/tests/groupby/test_value_counts.py | 89 +- 6 files changed, 1304 insertions(+), 1199 deletions(-) create mode 100644 pandas/tests/groupby/test_functional.py create mode 100644 pandas/tests/groupby/test_grouping.py diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py new file mode 100644 index 0000000000000..bc13d51c4f4f6 --- /dev/null +++ b/pandas/tests/groupby/test_functional.py @@ -0,0 +1,371 @@ +# -*- coding: utf-8 -*- + +""" test function application """ + +import pytest + +from string import ascii_lowercase +from pandas import (date_range, Timestamp, + Index, MultiIndex, DataFrame, Series) +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.compat import product as cart_product + +import numpy as np + +import pandas.util.testing as tm +import pandas as pd +from .common import MixIn + + +# describe +# -------------------------------- + +class TestDescribe(MixIn): + + def test_apply_describe_bug(self): + grouped = self.mframe.groupby(level='first') + grouped.describe() # it works! 
+ + def test_series_describe_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + assert_series_equal(result['mean'], grouped.mean(), check_names=False) + assert_series_equal(result['std'], grouped.std(), check_names=False) + assert_series_equal(result['min'], grouped.min(), check_names=False) + + def test_series_describe_single(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack() + assert_series_equal(result, expected) + + def test_series_index_name(self): + grouped = self.df.loc[:, ['C']].groupby(self.df['A']) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == 'A' + + def test_frame_describe_multikey(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in self.tsframe: + group = grouped[col].describe() + group_col = pd.MultiIndex([[col] * len(group.columns), + group.columns], + [[0] * len(group.columns), + range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + groupedT = self.tsframe.groupby({'A': 0, 'B': 0, + 'C': 1, 'D': 1}, axis=1) + result = groupedT.describe() + expected = self.tsframe.describe().T + expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], + [range(4), range(len(expected.index))]) + tm.assert_frame_equal(result, expected) + + def test_frame_describe_tupleindex(self): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, + 'y': [10, 20, 30, 40, 50] * 3, + 'z': [100, 200, 300, 400, 500] * 3}) + df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={'k': 'key'}) + pytest.raises(ValueError, lambda: df1.groupby('k').describe()) + pytest.raises(ValueError, lambda: df2.groupby('key').describe()) + + def test_frame_describe_unstacked_format(self): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + +# nunique +# -------------------------------- + +class TestNUnique(MixIn): + + def test_series_groupby_nunique(self): + + def check_nunique(df, keys, as_index=True): + for sort, dropna in cart_product((False, True), repeat=2): + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr['julie'].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + assert_series_equal(left, right, check_names=False) + + days = date_range('2015-08-23', periods=10) + + for 
n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): + frame = DataFrame({ + 'jim': np.random.choice( + list(ascii_lowercase), n), + 'joe': np.random.choice(days, n), + 'julie': np.random.randint(0, m, n) + }) + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + + frame.loc[1::17, 'jim'] = None + frame.loc[3::37, 'joe'] = None + frame.loc[7::19, 'julie'] = None + frame.loc[8::19, 'julie'] = None + frame.loc[9::19, 'julie'] = None + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) + + def test_nunique(self): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + def test_nunique_with_object(self): + # GH 11077 + data = pd.DataFrame( + [[100, 1, 'Alice'], + [200, 2, 'Bob'], + [300, 3, 'Charlie'], + [-400, 4, 'Dan'], + [500, 5, 'Edith']], + columns=['amount', 'id', 'name'] + ) + + result = data.groupby(['id', 'amount'])['name'].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = pd.Series([1] * 5, name='name', index=index) + tm.assert_series_equal(result, expected) + + def test_nunique_with_empty_series(self): + # GH 12553 + data = pd.Series(name='name') + result = data.groupby(level=0).nunique() + expected = pd.Series(name='name', dtype='int64') + tm.assert_series_equal(result, expected) + + def test_nunique_with_timegrouper(self): + # GH 13453 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + Timestamp('2016-06-28 16:09:30'), + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}).set_index('time') + result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() + expected = test.groupby( + pd.Grouper(freq='h') + )['data'].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +# count +# -------------------------------- + +class TestCount(MixIn): + + def test_groupby_timedelta_cython_count(self): + df = DataFrame({'g': list('ab' * 2), + 'delt': np.arange(4).astype('timedelta64[ns]')}) + expected = Series([ + 2, 2 + ], index=pd.Index(['a', 'b'], name='g'), name='delt') + result = df.groupby('g').delt.count() + tm.assert_series_equal(expected, result) + + def test_count(self): + n = 1 << 15 + dr = date_range('2015-08-30', periods=n // 10, freq='T') + + df = DataFrame({ + '1st': np.random.choice( + list(ascii_lowercase), n), + '2nd': np.random.randint(0, 5, n), + '3rd': np.random.randn(n).round(3), + '4th': np.random.randint(-10, 10, n), + '5th': np.random.choice(dr, n), + '6th': np.random.randn(n).round(3), + '7th': np.random.randn(n).round(3), + '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), + '9th': np.random.choice( + list(ascii_lowercase), n) + }) + + for col in df.columns.drop(['1st', '2nd', '4th']): + df.loc[np.random.choice(n, n // 10), col] = 
np.nan + + df['9th'] = df['9th'].astype('category') + + for key in '1st', '2nd', ['1st', '2nd']: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + assert_frame_equal(left, right) + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, np.nan]], + columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], + index=[1, 3]) + expected.index.name = 'A' + assert_frame_equal(count_not_as, expected.reset_index()) + assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + assert_series_equal(count_B, expected['B']) + + def test_count_object(self): + df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 3, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 1, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + def test_count_cross_type(self): # GH8169 + vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( + 0, 2, (100, 2)))) + + df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df[df == 2] = np.nan + expected = df.groupby(['c', 'd']).count() + + for t in ['float32', 'object']: + df['a'] = df['a'].astype(t) + df['b'] = df['b'].astype(t) + result = df.groupby(['c', 'd']).count() + tm.assert_frame_equal(result, expected) + + def test_lower_int_prec_count(self): + df = DataFrame({'a': np.array( + [0, 1, 2, 100], np.int8), + 'b': np.array( + [1, 2, 3, 6], np.uint32), + 'c': np.array( + [4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + def test_count_uses_size_on_exception(self): + class RaisingObjectException(Exception): + pass + + class RaisingObject(object): + + def __init__(self, msg='I will raise inside Cython'): + super(RaisingObject, self).__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({'a': [RaisingObject() for _ in range(4)], + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2]}, index=pd.Index( + list('ab'), name='grp')) + tm.assert_frame_equal(result, expected) + + +# size +# -------------------------------- + +class TestSize(MixIn): + + def test_size(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = self.df.groupby('A') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = self.df.groupby('B') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) + for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + assert_series_equal(left, right, 
check_names=False) + + # GH11699 + df = DataFrame([], columns=['A', 'B']) + out = Series([], dtype='int64', index=Index([], name='A')) + assert_series_equal(df.groupby('A').size(), out) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d25117fbd954..6f022aeff577b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4,19 +4,16 @@ import pytest from warnings import catch_warnings -from string import ascii_lowercase from datetime import datetime -from numpy import nan from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex) + concat, Panel, DatetimeIndex, read_csv) from pandas.errors import UnsupportedFunctionCall, PerformanceWarning -from pandas.util.testing import (assert_panel_equal, assert_frame_equal, - assert_series_equal, assert_almost_equal, - assert_index_equal) +from pandas.util.testing import (assert_frame_equal, assert_index_equal, + assert_series_equal, assert_almost_equal) from pandas.compat import (range, long, lrange, StringIO, lmap, lzip, map, zip, - builtins, OrderedDict, product as cart_product) + builtins, OrderedDict) from pandas import compat from collections import defaultdict import pandas.core.common as com @@ -76,261 +73,6 @@ def checkit(dtype): for dtype in ['int64', 'int32', 'float64', 'float32']: checkit(dtype) - def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) - g = df.groupby('A') - pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']] - - pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] - with tm.assert_raises_regex(KeyError, '^[^A]+$'): - # A should not be referenced as a bad column... - # will have to rethink regex if you change message! 
- g[['A', 'C']] - - def test_group_selection_cache(self): - # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') - - g = df.groupby('A') - result1 = g.head(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.tail(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.head(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.tail(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - def test_grouper_index_types(self): - # related GH5375 - # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) - for index in [tm.makeFloatIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, - tm.makePeriodIndex]: - - df.index = index(len(df)) - df.groupby(list('abcde')).apply(lambda x: x) - - df.index = list(reversed(df.index.tolist())) - df.groupby(list('abcde')).apply(lambda x: x) - - def test_grouper_multilevel_freq(self): - - # GH 7885 - # with level and freq specified in a pd.Grouper - from datetime import date, timedelta - d0 = date.today() - timedelta(days=14) - dates = date_range(d0, date.today()) - date_index = pd.MultiIndex.from_product( - [dates, dates], names=['foo', 'bar']) - df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) - - # Check string level - expected = df.reset_index().groupby([pd.Grouper( - key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() - # reset index changes columns dtype to object - expected.columns = pd.Index([0], dtype='int64') - - result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( - level='bar', freq='W')]).sum() - assert_frame_equal(result, expected) - - # Check integer level - result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( - level=1, freq='W')]).sum() - assert_frame_equal(result, expected) - - def test_grouper_creation_bug(self): - - # GH 8795 - df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) - g = df.groupby('A') - expected = g.sum() - - g = df.groupby(pd.Grouper(key='A')) - result = g.sum() - assert_frame_equal(result, expected) - - result = g.apply(lambda x: x.sum()) - assert_frame_equal(result, expected) - - g = df.groupby(pd.Grouper(key='A', axis=0)) - result = g.sum() - assert_frame_equal(result, expected) - - # GH14334 - # pd.Grouper(key=...) 
may be passed in a list - df = DataFrame({'A': [0, 0, 0, 1, 1, 1], - 'B': [1, 1, 2, 2, 3, 3], - 'C': [1, 2, 3, 4, 5, 6]}) - # Group by single column - expected = df.groupby('A').sum() - g = df.groupby([pd.Grouper(key='A')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group by two columns - # using a combination of strings and Grouper objects - expected = df.groupby(['A', 'B']).sum() - - # Group with two Grouper objects - g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group with a string and a Grouper object - g = df.groupby(['A', pd.Grouper(key='B')]) - result = g.sum() - assert_frame_equal(result, expected) - - # Group with a Grouper object and a string - g = df.groupby([pd.Grouper(key='A'), 'B']) - result = g.sum() - assert_frame_equal(result, expected) - - # GH8866 - s = Series(np.arange(8, dtype='int64'), - index=pd.MultiIndex.from_product( - [list('ab'), range(2), - date_range('20130101', periods=2)], - names=['one', 'two', 'three'])) - result = s.groupby(pd.Grouper(level='three', freq='M')).sum() - expected = Series([28], index=Index( - [Timestamp('2013-01-31')], freq='M', name='three')) - assert_series_equal(result, expected) - - # just specifying a level breaks - result = s.groupby(pd.Grouper(level='one')).sum() - expected = s.groupby(level='one').sum() - assert_series_equal(result, expected) - - def test_grouper_column_and_index(self): - # GH 14327 - - # Grouping a multi-index frame by a column and an index level should - # be equivalent to resetting the index and grouping by two columns - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_multi.reset_index().groupby(['B', 'inner']).mean() - assert_frame_equal(result, expected) - - # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_multi.reset_index().groupby(['inner', 'B']).mean() - assert_frame_equal(result, expected) - - # Grouping a single-index frame by a column and the index should - # be equivalent to resetting the index and grouping by two columns - df_single = df_multi.reset_index('outer') - result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_single.reset_index().groupby(['B', 'inner']).mean() - assert_frame_equal(result, expected) - - # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_single.reset_index().groupby(['inner', 'B']).mean() - assert_frame_equal(result, expected) - - def test_grouper_getting_correct_binner(self): - - # GH 10063 - # using a non-time-based grouper and a time-based grouper - # and specifying levels - df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( - [list('ab'), date_range('20130101', periods=80)], names=['one', - 'two'])) - result = df.groupby([pd.Grouper(level='one'), pd.Grouper( - level='two', freq='M')]).sum() - expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, - index=MultiIndex.from_product( - [list('ab'), - date_range('20130101', freq='M', periods=3)], - names=['one', 'two'])) - assert_frame_equal(result, expected) - - def test_grouper_iter(self): - assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] - - def test_empty_groups(self): - 
# see gh-1048 - pytest.raises(ValueError, self.df.groupby, []) - - def test_groupby_grouper(self): - grouped = self.df.groupby('A') - - result = self.df.groupby(grouped.grouper).mean() - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_groupby_duplicated_column_errormsg(self): - # GH7511 - df = DataFrame(columns=['A', 'B', 'A', 'C'], - data=[range(4), range(2, 6), range(0, 8, 2)]) - - pytest.raises(ValueError, df.groupby, 'A') - pytest.raises(ValueError, df.groupby, ['A', 'B']) - - grouped = df.groupby('B') - c = grouped.count() - assert c.columns.nlevels == 1 - assert c.columns.size == 3 - - def test_groupby_dict_mapping(self): - # GH #679 - from pandas import Series - s = Series({'T1': 5}) - result = s.groupby({'T1': 'T2'}).agg(sum) - expected = s.groupby(['T2']).agg(sum) - assert_series_equal(result, expected) - - s = Series([1., 2., 3., 4.], index=list('abcd')) - mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} - - result = s.groupby(mapping).mean() - result2 = s.groupby(mapping).agg(np.mean) - expected = s.groupby([0, 0, 1, 1]).mean() - expected2 = s.groupby([0, 0, 1, 1]).mean() - assert_series_equal(result, expected) - assert_series_equal(result, result2) - assert_series_equal(result, expected2) - - def test_groupby_grouper_f_sanity_checked(self): - dates = date_range('01-Jan-2013', periods=12, freq='MS') - ts = Series(np.random.randn(12), index=dates) - - # GH3035 - # index.map is used to apply grouper to the index - # if it fails on the elements, map tries it on the entire index as - # a sequence. That can yield invalid results that cause trouble - # down the line. - # the surprise comes from using key[0:6] rather then str(key)[0:6] - # when the elements are Timestamp. - # the result is Index[0:6], very confusing. - - pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) - def test_groupby_nonobject_dtype(self): key = self.mframe.index.labels[0] grouped = self.mframe.groupby(key) @@ -444,86 +186,6 @@ def f(grp): e.name = None assert_series_equal(result, e) - def test_get_group(self): - with catch_warnings(record=True): - wp = tm.makePanel() - grouped = wp.groupby(lambda x: x.month, axis='major') - - gp = grouped.get_group(1) - expected = wp.reindex( - major=[x for x in wp.major_axis if x.month == 1]) - assert_panel_equal(gp, expected) - - # GH 5267 - # be datelike friendly - df = DataFrame({'DATE': pd.to_datetime( - ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', - '11-Oct-2013', '11-Oct-2013']), - 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], - 'VAL': [1, 2, 3, 4, 5, 6]}) - - g = df.groupby('DATE') - key = list(g.groups)[0] - result1 = g.get_group(key) - result2 = g.get_group(Timestamp(key).to_pydatetime()) - result3 = g.get_group(str(Timestamp(key))) - assert_frame_equal(result1, result2) - assert_frame_equal(result1, result3) - - g = df.groupby(['DATE', 'label']) - - key = list(g.groups)[0] - result1 = g.get_group(key) - result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) - result3 = g.get_group((str(Timestamp(key[0])), key[1])) - assert_frame_equal(result1, result2) - assert_frame_equal(result1, result3) - - # must pass a same-length tuple with multiple keys - pytest.raises(ValueError, lambda: g.get_group('foo')) - pytest.raises(ValueError, lambda: g.get_group(('foo'))) - pytest.raises(ValueError, - lambda: g.get_group(('foo', 'bar', 'baz'))) - - def test_get_group_empty_bins(self): - - d = pd.DataFrame([3, 1, 7, 6]) - bins = [0, 5, 10, 15] - g = d.groupby(pd.cut(d[0], bins)) - - # TODO: should prob allow a 
str of Interval work as well - # IOW '(0, 5]' - result = g.get_group(pd.Interval(0, 5)) - expected = DataFrame([3, 1], index=[0, 1]) - assert_frame_equal(result, expected) - - pytest.raises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) - - def test_get_group_grouped_by_tuple(self): - # GH 8121 - df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T - gr = df.groupby('ids') - expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) - result = gr.get_group((1, )) - assert_frame_equal(result, expected) - - dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', - '2010-01-02']) - df = DataFrame({'ids': [(x, ) for x in dt]}) - gr = df.groupby('ids') - result = gr.get_group(('2010-01-01', )) - expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) - assert_frame_equal(result, expected) - - def test_grouping_error_on_multidim_input(self): - from pandas.core.groupby import Grouping - pytest.raises(ValueError, - Grouping, self.df.index, self.df[['A', 'A']]) - - def test_apply_describe_bug(self): - grouped = self.mframe.groupby(level='first') - grouped.describe() # it works! - def test_apply_issues(self): # GH 5788 @@ -604,22 +266,6 @@ def test_len(self): assert len(df.groupby(('b'))) == 3 assert len(df.groupby(('a', 'b'))) == 3 - def test_groups(self): - grouped = self.df.groupby(['A']) - groups = grouped.groups - assert groups is grouped.groups # caching works - - for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k).all() - - grouped = self.df.groupby(['A', 'B']) - groups = grouped.groups - assert groups is grouped.groups # caching works - - for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k[0]).all() - assert (self.df.loc[v]['B'] == k[1]).all() - def test_basic_regression(self): # regression T = [1.0 * x for x in lrange(1, 10) * 10][:1095] @@ -631,13 +277,13 @@ def test_basic_regression(self): grouped = result.groupby(groupings) grouped.mean() - def test_with_na(self): + def test_with_na_groups(self): index = Index(np.arange(10)) for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']: values = Series(np.ones(10), index, dtype=dtype) - labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar', - 'bar', nan, 'foo'], index=index) + labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, + 'bar', 'bar', np.nan, 'foo'], index=index) # this SHOULD be an int grouped = values.groupby(labels) @@ -730,81 +376,6 @@ def test_attr_wrapper(self): # make sure raises error pytest.raises(AttributeError, getattr, grouped, 'foo') - def test_series_describe_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - assert_series_equal(result['mean'], grouped.mean(), check_names=False) - assert_series_equal(result['std'], grouped.std(), check_names=False) - assert_series_equal(result['min'], grouped.min(), check_names=False) - - def test_series_describe_single(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() - assert_series_equal(result, expected) - - def test_series_index_name(self): - grouped = self.df.loc[:, ['C']].groupby(self.df['A']) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' - - def test_frame_describe_multikey(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in self.tsframe: - 
group = grouped[col].describe() - group_col = pd.MultiIndex([[col] * len(group.columns), - group.columns], - [[0] * len(group.columns), - range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - groupedT = self.tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) - result = groupedT.describe() - expected = self.tsframe.describe().T - expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], - [range(4), range(len(expected.index))]) - tm.assert_frame_equal(result, expected) - - def test_frame_describe_tupleindex(self): - - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) - pytest.raises(ValueError, lambda: df1.groupby('k').describe()) - pytest.raises(ValueError, lambda: df2.groupby('key').describe()) - - def test_frame_describe_unstacked_format(self): - # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df = pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) - tm.assert_frame_equal(result, expected) - def test_frame_groupby(self): grouped = self.tsframe.groupby(lambda x: x.weekday()) @@ -845,16 +416,6 @@ def test_frame_groupby(self): samething = self.tsframe.index.take(indices[k]) assert (samething == v).all() - def test_grouping_is_iterable(self): - # this code path isn't used anywhere else - # not sure it's useful - grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year - ]) - - # test it works - for g in grouped.grouper.groupings[0]: - pass - def test_frame_groupby_columns(self): mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} grouped = self.tsframe.groupby(mapping, axis=1) @@ -900,73 +461,6 @@ def test_frame_set_name_single(self): result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) assert result.index.name == 'A' - def test_multi_iter(self): - s = Series(np.arange(6)) - k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - - grouped = s.groupby([k1, k2]) - - iterated = list(grouped) - expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), - ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] - for i, ((one, two), three) in enumerate(iterated): - e1, e2, e3 = expected[i] - assert e1 == one - assert e2 == two - assert_series_equal(three, e3) - - def test_multi_iter_frame(self): - k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': k1, 'k2': k2}, - index=['one', 'two', 'three', 'four', 'five', 'six']) - - grouped = df.groupby(['k1', 'k2']) - - # things get sorted! 
- iterated = list(grouped) - idx = df.index - expected = [('a', '1', df.loc[idx[[4]]]), - ('a', '2', df.loc[idx[[3, 5]]]), - ('b', '1', df.loc[idx[[0, 2]]]), - ('b', '2', df.loc[idx[[1]]])] - for i, ((one, two), three) in enumerate(iterated): - e1, e2, e3 = expected[i] - assert e1 == one - assert e2 == two - assert_frame_equal(three, e3) - - # don't iterate through groups with no data - df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) - grouped = df.groupby(['k1', 'k2']) - groups = {} - for key, gp in grouped: - groups[key] = gp - assert len(groups) == 2 - - # axis = 1 - three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() - grouped = three_levels.T.groupby(axis=1, level=(1, 2)) - for key, group in grouped: - pass - - def test_multi_iter_panel(self): - with catch_warnings(record=True): - wp = tm.makePanel() - grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], - axis=1) - - for (month, wd), group in grouped: - exp_axis = [x - for x in wp.major_axis - if x.month == month and x.weekday() == wd] - expected = wp.reindex(major=exp_axis) - assert_panel_equal(group, expected) - def test_multi_func(self): col1 = self.df['A'] col2 = self.df['B'] @@ -1115,79 +609,6 @@ def test_groupby_as_index_agg(self): assert_frame_equal(left, right) - def test_series_groupby_nunique(self): - - def check_nunique(df, keys, as_index=True): - for sort, dropna in cart_product((False, True), repeat=2): - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - assert_series_equal(left, right, check_names=False) - - days = date_range('2015-08-23', periods=10) - - for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): - frame = DataFrame({ - 'jim': np.random.choice( - list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n) - }) - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) - - def test_multiindex_passthru(self): - - # GH 7997 - # regression from 0.14.1 - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) - - result = df.groupby(axis=1, level=[0, 1]).first() - assert_frame_equal(result, df) - - def test_multiindex_negative_level(self): - # GH 13901 - result = self.mframe.groupby(level=-1).sum() - expected = self.mframe.groupby(level='second').sum() - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=-2).sum() - expected = self.mframe.groupby(level='first').sum() - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=[-2, -1]).sum() - expected = self.mframe - assert_frame_equal(result, expected) - - result = self.mframe.groupby(level=[-1, 'first']).sum() - expected = self.mframe.groupby(level=['second', 'first']).sum() - assert_frame_equal(result, expected) - - def test_multifunc_select_col_integer_cols(self): - df = self.df - df.columns = np.arange(len(df.columns)) - - # it works! 
- df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) - def test_as_index_series_return_frame(self): grouped = self.df.groupby('A', as_index=False) grouped2 = self.df.groupby(['A', 'B'], as_index=False) @@ -1286,55 +707,6 @@ def test_groupby_as_index_apply(self): res = df.groupby(0, as_index=False).apply(lambda x: x).index assert_index_equal(res, ind) - def test_groupby_head_tail(self): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) - - # as_index= False, much easier - assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) - assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - - empty_not_as = DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_not_as, g_not_as.head(0)) - assert_frame_equal(empty_not_as, g_not_as.tail(0)) - assert_frame_equal(empty_not_as, g_not_as.head(-1)) - assert_frame_equal(empty_not_as, g_not_as.tail(-1)) - - assert_frame_equal(df, g_not_as.head(7)) # contains all - assert_frame_equal(df, g_not_as.tail(7)) - - # as_index=True, (used to be different) - df_as = df - - assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) - assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) - - empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_as, g_as.head(0)) - assert_frame_equal(empty_as, g_as.tail(0)) - assert_frame_equal(empty_as, g_as.head(-1)) - assert_frame_equal(empty_as, g_as.tail(-1)) - - assert_frame_equal(df_as, g_as.head(7)) # contains all - assert_frame_equal(df_as, g_as.tail(7)) - - # test with selection - assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, lambda x: x.month, @@ -1613,15 +985,6 @@ def test_arg_passthru(self): result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - def test_groupby_timedelta_cython_count(self): - df = DataFrame({'g': list('ab' * 2), - 'delt': np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() - tm.assert_series_equal(expected, result) - def test_wrap_aggregated_output_multindex(self): df = self.mframe.T df['baz', 'two'] = 'peekaboo' @@ -1639,75 +1002,6 @@ def aggfun(ser): agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) - @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level(self, sort): - # GH 17537 - frame = self.mframe - deleveled = frame.reset_index() - - result0 = frame.groupby(level=0, sort=sort).sum() - result1 = frame.groupby(level=1, sort=sort).sum() - - expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() 
- expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() - - expected0.index.name = 'first' - expected1.index.name = 'second' - - assert result0.index.name == 'first' - assert result1.index.name == 'second' - - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - assert result0.index.name == frame.index.names[0] - assert result1.index.name == frame.index.names[1] - - # groupby level name - result0 = frame.groupby(level='first', sort=sort).sum() - result1 = frame.groupby(level='second', sort=sort).sum() - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - - # axis=1 - - result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() - result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() - assert_frame_equal(result0, expected0.T) - assert_frame_equal(result1, expected1.T) - - # raise exception for non-MultiIndex - pytest.raises(ValueError, self.df.groupby, level=1) - - def test_groupby_level_index_names(self): - # GH4014 this used to raise ValueError since 'exp'>1 (in py2) - df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, - 'var1': lrange(6), }).set_index('exp') - df.groupby(level='exp') - pytest.raises(ValueError, df.groupby, level='foo') - - @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level_with_nas(self, sort): - # GH 17537 - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, - 2, 3]]) - - # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) - result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 22.], index=[0, 1]) - assert_series_equal(result, expected) - - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, - 1, 2, 3]]) - - # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) - result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 18.], index=[0.0, 1.0]) - assert_series_equal(result, expected) - def test_groupby_level_apply(self): frame = self.mframe @@ -1719,22 +1013,6 @@ def test_groupby_level_apply(self): result = frame['A'].groupby(level=0).count() assert result.index.name == 'first' - def test_groupby_args(self): - # PR8618 and issue 8015 - frame = self.mframe - - def j(): - frame.groupby() - - tm.assert_raises_regex(TypeError, "You have to supply one of " - "'by' and 'level'", j) - - def k(): - frame.groupby(by=None, level=None) - - tm.assert_raises_regex(TypeError, "You have to supply one of " - "'by' and 'level'", k) - def test_groupby_level_mapper(self): frame = self.mframe deleveled = frame.reset_index() @@ -1788,21 +1066,6 @@ def test_groupby_complex(self): result = a.sum(level=0) assert_series_equal(result, expected) - @pytest.mark.parametrize('sort,labels', [ - [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], - [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] - ]) - def test_level_preserve_order(self, sort, labels): - # GH 17537 - grouped = self.mframe.groupby(level=0, sort=sort) - exp_labels = np.array(labels, np.intp) - assert_almost_equal(grouped.grouper.labels[0], exp_labels) - - def test_grouping_labels(self): - grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) - exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - assert_almost_equal(grouped.grouper.labels[0], exp_labels) - def test_apply_series_to_frame(self): def f(piece): with np.errstate(invalid='ignore'): @@ -2014,157 +1277,26 @@ def f(x, q=None, axis=0): assert_frame_equal(agg_result, expected, 
check_names=False) assert_frame_equal(apply_result, expected) - def test_size(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('A') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('B') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) - assert_series_equal(left, right, check_names=False) - - # GH11699 - df = DataFrame([], columns=['A', 'B']) - out = Series([], dtype='int64', index=Index([], name='A')) - assert_series_equal(df.groupby('A').size(), out) - - def test_count(self): - from string import ascii_lowercase - n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', '4th']): - df.loc[np.random.choice(n, n // 10), col] = np.nan - - df['9th'] = df['9th'].astype('category') - - for key in '1st', '2nd', ['1st', '2nd']: - left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) - assert_frame_equal(left, right) - - # GH5610 - # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], - columns=['A', 'B', 'C']) - - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() - - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - assert_frame_equal(count_not_as, expected.reset_index()) - assert_frame_equal(count_as, expected) - - count_B = df.groupby('A')['B'].count() - assert_series_equal(count_B, expected['B']) - - def test_count_object(self): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - def test_count_cross_type(self): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) - - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) - df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() - - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() - tm.assert_frame_equal(result, expected) - - def test_nunique(self): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', 
as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() - tm.assert_frame_equal(result, expected) - def test_non_cython_api(self): # GH5610 # non-cython calls should not include the grouper df = DataFrame( - [[1, 2, 'foo'], [1, - nan, - 'bar', ], [3, nan, 'baz'] - ], columns=['A', 'B', 'C']) + [[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, 'baz']], + columns=['A', 'B', 'C']) g = df.groupby('A') gni = df.groupby('A', as_index=False) # mad - expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3]) + expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.mad() assert_frame_equal(result, expected) - expected = DataFrame([[0., 0.], [0, nan]], columns=['A', 'B'], + expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], index=[0, 1]) result = gni.mad() assert_frame_equal(result, expected) @@ -2175,8 +1307,9 @@ def test_non_cython_api(self): ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']], labels=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, nan, nan, nan, nan, nan, nan, nan]], + expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan]], index=expected_index, columns=expected_col) result = g.describe() @@ -2196,7 +1329,7 @@ def test_non_cython_api(self): assert_frame_equal(result, expected) # idxmax - expected = DataFrame([[0.0], [nan]], columns=['B'], index=[1, 3]) + expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.idxmax() assert_frame_equal(result, expected) @@ -2319,7 +1452,6 @@ def f(g): assert 'value3' in result def test_groupby_wrong_multi_labels(self): - from pandas import read_csv data = """index,foo,bar,baz,spam,data 0,foo1,bar1,baz1,spam2,20 1,foo1,bar2,baz1,spam3,30 @@ -2620,14 +1752,6 @@ def test_groupby_nat_exclude(self): pytest.raises(KeyError, grouped.get_group, np.nan) pytest.raises(KeyError, grouped.get_group, pd.NaT) - def test_dictify(self): - dict(iter(self.df.groupby('A'))) - dict(iter(self.df.groupby(['A', 'B']))) - dict(iter(self.df['C'].groupby(self.df['A']))) - dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) - dict(iter(self.df.groupby('A')['C'])) - dict(iter(self.df.groupby(['A', 'B'])['C'])) - def test_sparse_friendly(self): sdf = self.df[['C', 'D']].to_sparse() with catch_warnings(record=True): @@ -2734,16 +1858,6 @@ def test_intercept_builtin_sum(self): assert_series_equal(result, expected) assert_series_equal(result2, expected) - def test_column_select_via_attr(self): - result = self.df.groupby('A').C.sum() - expected = self.df.groupby('A')['C'].sum() - assert_series_equal(result, expected) - - self.df['mean'] = 1.5 - result = self.df.groupby('A').mean() - expected = self.df.groupby('A').agg(np.mean) - assert_frame_equal(result, expected) - def test_rank_apply(self): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130) @@ -2835,40 +1949,6 @@ def g(group): assert isinstance(result, Series) 
assert_series_equal(result, expected) - def test_getitem_list_of_columns(self): - df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': np.random.randn(8)}) - - result = df.groupby('A')[['C', 'D']].mean() - result2 = df.groupby('A')['C', 'D'].mean() - result3 = df.groupby('A')[df.columns[2:4]].mean() - - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - - def test_getitem_numeric_column_names(self): - # GH #13731 - df = DataFrame({0: list('abcd') * 2, - 2: np.random.randn(8), - 4: np.random.randn(8), - 6: np.random.randn(8)}) - result = df.groupby(0)[df.columns[1:3]].mean() - result2 = df.groupby(0)[2, 4].mean() - result3 = df.groupby(0)[[2, 4]].mean() - - expected = df.loc[:, [0, 2, 4]].groupby(0).mean() - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - def test_set_group_name(self): def f(group): assert group.name is not None @@ -3149,22 +2229,6 @@ def test_groupby_multiindex_not_lexsorted(self): expected = df.sort_index() tm.assert_frame_equal(expected, result) - def test_groupby_levels_and_columns(self): - # GH9344, GH9049 - idx_names = ['x', 'y'] - idx = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) - df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) - - by_levels = df.groupby(level=idx_names).mean() - # reset_index changes columns dtype to object - by_columns = df.reset_index().groupby(idx_names).mean() - - tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) - - by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) - tm.assert_frame_equal(by_levels, by_columns) - def test_gb_apply_list_of_unequal_len_arrays(self): # GH1738 @@ -3189,74 +2253,6 @@ def noddy(value, weight): # don't die df_grouped.apply(lambda x: noddy(x.value, x.weight)) - def test_groupby_with_empty(self): - index = pd.DatetimeIndex(()) - data = () - series = pd.Series(data, index) - grouper = pd.Grouper(freq='D') - grouped = series.groupby(grouper) - assert next(iter(grouped), None) is None - - def test_groupby_with_single_column(self): - df = pd.DataFrame({'a': list('abssbab')}) - tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) - # GH 13530 - exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) - tm.assert_frame_equal(df.groupby('a').count(), exp) - tm.assert_frame_equal(df.groupby('a').sum(), exp) - tm.assert_frame_equal(df.groupby('a').nth(1), exp) - - def test_groupby_with_small_elem(self): - # GH 8542 - # length=2 - df = pd.DataFrame({'event': ['start', 'start'], - 'change': [1234, 5678]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 2 - assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', 
'2013-10-10', - '2014-09-15'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 2 - assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0, 2], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - - # length=3 - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-08-05'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) - assert len(grouped.groups) == 3 - assert grouped.ngroups == 3 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups - - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) - tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[1], :]) - res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) - tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_fill_constistency(self): # GH9221 @@ -3303,42 +2299,6 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) - def test_lower_int_prec_count(self): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) - tm.assert_frame_equal(result, expected) - - def test_count_uses_size_on_exception(self): - class RaisingObjectException(Exception): - pass - - class RaisingObject(object): - - def __init__(self, msg='I will raise inside Cython'): - super(RaisingObject, self).__init__() - self.msg = msg - - def __eq__(self, other): - # gets called in Cython to check that raising calls the method - raise RaisingObjectException(self.msg) - - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) - tm.assert_frame_equal(result, expected) - def test_groupby_cumprod(self): # GH 4095 df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) @@ -3510,42 +2470,6 @@ def test_sort(x): g.apply(test_sort) - def test_nunique_with_object(self): - # GH 11077 - data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] - ) - - result = data.groupby(['id', 'amount'])['name'].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) - tm.assert_series_equal(result, expected) - - def test_nunique_with_empty_series(self): - # GH 12553 - data = pd.Series(name='name') - result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') - tm.assert_series_equal(result, expected) - - def test_nunique_with_timegrouper(self): - # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - 
Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) - tm.assert_series_equal(result, expected) - def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) @@ -3559,16 +2483,6 @@ def test_numpy_compat(self): tm.assert_raises_regex(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) - def test_grouping_string_repr(self): - # GH 13394 - mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) - df = DataFrame([[1, 2, 3]], columns=mi) - gr = df.groupby(df[('A', 'a')]) - - result = gr.grouper.groupings[0].__repr__() - expected = "Grouping(('A', 'a'))" - assert result == expected - def test_group_shift_with_null_key(self): # This test is designed to replicate the segfault in issue #13813. n_rows = 1200 @@ -3749,19 +2663,6 @@ def predictions(tool): result = df2.groupby('Key').apply(predictions).p1 tm.assert_series_equal(expected, result) - def test_gb_key_len_equal_axis_len(self): - # GH16843 - # test ensures that index and column keys are recognized correctly - # when number of keys equals axis length of groupby - df = pd.DataFrame([['foo', 'bar', 'B', 1], - ['foo', 'bar', 'B', 2], - ['foo', 'baz', 'C', 3]], - columns=['first', 'second', 'third', 'one']) - df = df.set_index(['first', 'second']) - df = df.groupby(['first', 'second', 'third']).size() - assert df.loc[('foo', 'bar', 'B')] == 2 - assert df.loc[('foo', 'baz', 'C')] == 1 - def test_pipe(self): # Test the pipe method of DataFrameGroupBy. # Issue #17871 diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py new file mode 100644 index 0000000000000..824c754a5d753 --- /dev/null +++ b/pandas/tests/groupby/test_grouping.py @@ -0,0 +1,732 @@ +# -*- coding: utf-8 -*- + +""" test where we are determining what we are grouping, or getting groups """ + +import pytest + +from warnings import catch_warnings +from pandas import (date_range, Timestamp, + Index, MultiIndex, DataFrame, Series) +from pandas.util.testing import (assert_panel_equal, assert_frame_equal, + assert_series_equal, assert_almost_equal) +from pandas.compat import lrange + +from pandas import compat +import numpy as np + +import pandas.util.testing as tm +import pandas as pd +from .common import MixIn + + +# selection +# -------------------------------- + +class TestSelection(MixIn): + + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=['A', 'B']) + g = df.groupby('A') + pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']] + + pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] + with tm.assert_raises_regex(KeyError, '^[^A]+$'): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! 
+ g[['A', 'C']] + + def test_groupby_duplicated_column_errormsg(self): + # GH7511 + df = DataFrame(columns=['A', 'B', 'A', 'C'], + data=[range(4), range(2, 6), range(0, 8, 2)]) + + pytest.raises(ValueError, df.groupby, 'A') + pytest.raises(ValueError, df.groupby, ['A', 'B']) + + grouped = df.groupby('B') + c = grouped.count() + assert c.columns.nlevels == 1 + assert c.columns.size == 3 + + def test_column_select_via_attr(self): + result = self.df.groupby('A').C.sum() + expected = self.df.groupby('A')['C'].sum() + assert_series_equal(result, expected) + + self.df['mean'] = 1.5 + result = self.df.groupby('A').mean() + expected = self.df.groupby('A').agg(np.mean) + assert_frame_equal(result, expected) + + def test_getitem_list_of_columns(self): + df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': np.random.randn(8)}) + + result = df.groupby('A')[['C', 'D']].mean() + result2 = df.groupby('A')['C', 'D'].mean() + result3 = df.groupby('A')[df.columns[2:4]].mean() + + expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + def test_getitem_numeric_column_names(self): + # GH #13731 + df = DataFrame({0: list('abcd') * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8)}) + result = df.groupby(0)[df.columns[1:3]].mean() + result2 = df.groupby(0)[2, 4].mean() + result3 = df.groupby(0)[[2, 4]].mean() + + expected = df.loc[:, [0, 2, 4]].groupby(0).mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + +# grouping +# -------------------------------- + +class TestGrouping(MixIn): + + def test_grouper_index_types(self): + # related GH5375 + # groupby misbehaving when using a Floatlike index + df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) + for index in [tm.makeFloatIndex, tm.makeStringIndex, + tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, + tm.makePeriodIndex]: + + df.index = index(len(df)) + df.groupby(list('abcde')).apply(lambda x: x) + + df.index = list(reversed(df.index.tolist())) + df.groupby(list('abcde')).apply(lambda x: x) + + def test_grouper_multilevel_freq(self): + + # GH 7885 + # with level and freq specified in a pd.Grouper + from datetime import date, timedelta + d0 = date.today() - timedelta(days=14) + dates = date_range(d0, date.today()) + date_index = pd.MultiIndex.from_product( + [dates, dates], names=['foo', 'bar']) + df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) + + # Check string level + expected = df.reset_index().groupby([pd.Grouper( + key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() + # reset index changes columns dtype to object + expected.columns = pd.Index([0], dtype='int64') + + result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( + level='bar', freq='W')]).sum() + assert_frame_equal(result, expected) + + # Check integer level + result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( + level=1, freq='W')]).sum() + assert_frame_equal(result, expected) + + def test_grouper_creation_bug(self): + + # GH 8795 + df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) + g = df.groupby('A') + expected = g.sum() + + g = df.groupby(pd.Grouper(key='A')) + result = g.sum() + assert_frame_equal(result, expected) + 
+ result = g.apply(lambda x: x.sum()) + assert_frame_equal(result, expected) + + g = df.groupby(pd.Grouper(key='A', axis=0)) + result = g.sum() + assert_frame_equal(result, expected) + + # GH14334 + # pd.Grouper(key=...) may be passed in a list + df = DataFrame({'A': [0, 0, 0, 1, 1, 1], + 'B': [1, 1, 2, 2, 3, 3], + 'C': [1, 2, 3, 4, 5, 6]}) + # Group by single column + expected = df.groupby('A').sum() + g = df.groupby([pd.Grouper(key='A')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group by two columns + # using a combination of strings and Grouper objects + expected = df.groupby(['A', 'B']).sum() + + # Group with two Grouper objects + g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a string and a Grouper object + g = df.groupby(['A', pd.Grouper(key='B')]) + result = g.sum() + assert_frame_equal(result, expected) + + # Group with a Grouper object and a string + g = df.groupby([pd.Grouper(key='A'), 'B']) + result = g.sum() + assert_frame_equal(result, expected) + + # GH8866 + s = Series(np.arange(8, dtype='int64'), + index=pd.MultiIndex.from_product( + [list('ab'), range(2), + date_range('20130101', periods=2)], + names=['one', 'two', 'three'])) + result = s.groupby(pd.Grouper(level='three', freq='M')).sum() + expected = Series([28], index=Index( + [Timestamp('2013-01-31')], freq='M', name='three')) + assert_series_equal(result, expected) + + # just specifying a level breaks + result = s.groupby(pd.Grouper(level='one')).sum() + expected = s.groupby(level='one').sum() + assert_series_equal(result, expected) + + def test_grouper_column_and_index(self): + # GH 14327 + + # Grouping a multi-index frame by a column and an index level should + # be equivalent to resetting the index and grouping by two columns + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + + # Grouping a single-index frame by a column and the index should + # be equivalent to resetting the index and grouping by two columns + df_single = df_multi.reset_index('outer') + result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_single.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_single.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + + def test_groupby_levels_and_columns(self): + # GH9344, GH9049 + idx_names = ['x', 'y'] + idx = pd.MultiIndex.from_tuples( + [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) + df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) + + by_levels = df.groupby(level=idx_names).mean() + # reset_index changes columns dtype to object + by_columns = df.reset_index().groupby(idx_names).mean() + + tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) + + by_columns.columns = 
pd.Index(by_columns.columns, dtype=np.int64) + tm.assert_frame_equal(by_levels, by_columns) + + def test_grouper_getting_correct_binner(self): + + # GH 10063 + # using a non-time-based grouper and a time-based grouper + # and specifying levels + df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( + [list('ab'), date_range('20130101', periods=80)], names=['one', + 'two'])) + result = df.groupby([pd.Grouper(level='one'), pd.Grouper( + level='two', freq='M')]).sum() + expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, + index=MultiIndex.from_product( + [list('ab'), + date_range('20130101', freq='M', periods=3)], + names=['one', 'two'])) + assert_frame_equal(result, expected) + + def test_grouper_iter(self): + assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] + + def test_empty_groups(self): + # see gh-1048 + pytest.raises(ValueError, self.df.groupby, []) + + def test_groupby_grouper(self): + grouped = self.df.groupby('A') + + result = self.df.groupby(grouped.grouper).mean() + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_groupby_dict_mapping(self): + # GH #679 + from pandas import Series + s = Series({'T1': 5}) + result = s.groupby({'T1': 'T2'}).agg(sum) + expected = s.groupby(['T2']).agg(sum) + assert_series_equal(result, expected) + + s = Series([1., 2., 3., 4.], index=list('abcd')) + mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} + + result = s.groupby(mapping).mean() + result2 = s.groupby(mapping).agg(np.mean) + expected = s.groupby([0, 0, 1, 1]).mean() + expected2 = s.groupby([0, 0, 1, 1]).mean() + assert_series_equal(result, expected) + assert_series_equal(result, result2) + assert_series_equal(result, expected2) + + def test_groupby_grouper_f_sanity_checked(self): + dates = date_range('01-Jan-2013', periods=12, freq='MS') + ts = Series(np.random.randn(12), index=dates) + + # GH3035 + # index.map is used to apply grouper to the index + # if it fails on the elements, map tries it on the entire index as + # a sequence. That can yield invalid results that cause trouble + # down the line. + # the surprise comes from using key[0:6] rather then str(key)[0:6] + # when the elements are Timestamp. + # the result is Index[0:6], very confusing. + + pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) + + def test_grouping_error_on_multidim_input(self): + from pandas.core.groupby import Grouping + pytest.raises(ValueError, + Grouping, self.df.index, self.df[['A', 'A']]) + + def test_multiindex_passthru(self): + + # GH 7997 + # regression from 0.14.1 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) + + result = df.groupby(axis=1, level=[0, 1]).first() + assert_frame_equal(result, df) + + def test_multiindex_negative_level(self): + # GH 13901 + result = self.mframe.groupby(level=-1).sum() + expected = self.mframe.groupby(level='second').sum() + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=-2).sum() + expected = self.mframe.groupby(level='first').sum() + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=[-2, -1]).sum() + expected = self.mframe + assert_frame_equal(result, expected) + + result = self.mframe.groupby(level=[-1, 'first']).sum() + expected = self.mframe.groupby(level=['second', 'first']).sum() + assert_frame_equal(result, expected) + + def test_multifunc_select_col_integer_cols(self): + df = self.df + df.columns = np.arange(len(df.columns)) + + # it works! 
+ df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level(self, sort): + # GH 17537 + frame = self.mframe + deleveled = frame.reset_index() + + result0 = frame.groupby(level=0, sort=sort).sum() + result1 = frame.groupby(level=1, sort=sort).sum() + + expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() + expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() + + expected0.index.name = 'first' + expected1.index.name = 'second' + + assert result0.index.name == 'first' + assert result1.index.name == 'second' + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + assert result0.index.name == frame.index.names[0] + assert result1.index.name == frame.index.names[1] + + # groupby level name + result0 = frame.groupby(level='first', sort=sort).sum() + result1 = frame.groupby(level='second', sort=sort).sum() + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + # axis=1 + + result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() + result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() + assert_frame_equal(result0, expected0.T) + assert_frame_equal(result1, expected1.T) + + # raise exception for non-MultiIndex + pytest.raises(ValueError, self.df.groupby, level=1) + + def test_groupby_level_index_names(self): + # GH4014 this used to raise ValueError since 'exp'>1 (in py2) + df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, + 'var1': lrange(6), }).set_index('exp') + df.groupby(level='exp') + pytest.raises(ValueError, df.groupby, level='foo') + + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level_with_nas(self, sort): + # GH 17537 + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, + 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 22.], index=[0, 1]) + assert_series_equal(result, expected) + + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, + 1, 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 18.], index=[0.0, 1.0]) + assert_series_equal(result, expected) + + def test_groupby_args(self): + # PR8618 and issue 8015 + frame = self.mframe + + def j(): + frame.groupby() + + tm.assert_raises_regex(TypeError, "You have to supply one of " + "'by' and 'level'", j) + + def k(): + frame.groupby(by=None, level=None) + + tm.assert_raises_regex(TypeError, "You have to supply one of " + "'by' and 'level'", k) + + @pytest.mark.parametrize('sort,labels', [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] + ]) + def test_level_preserve_order(self, sort, labels): + # GH 17537 + grouped = self.mframe.groupby(level=0, sort=sort) + exp_labels = np.array(labels, np.intp) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_grouping_labels(self): + grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + +# get_group +# -------------------------------- + +class TestGetGroup(MixIn): + + def test_get_group(self): + with catch_warnings(record=True): + wp = tm.makePanel() + grouped = wp.groupby(lambda x: 
x.month, axis='major') + + gp = grouped.get_group(1) + expected = wp.reindex( + major=[x for x in wp.major_axis if x.month == 1]) + assert_panel_equal(gp, expected) + + # GH 5267 + # be datelike friendly + df = DataFrame({'DATE': pd.to_datetime( + ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', + '11-Oct-2013', '11-Oct-2013']), + 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], + 'VAL': [1, 2, 3, 4, 5, 6]}) + + g = df.groupby('DATE') + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group(Timestamp(key).to_pydatetime()) + result3 = g.get_group(str(Timestamp(key))) + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + + g = df.groupby(['DATE', 'label']) + + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) + result3 = g.get_group((str(Timestamp(key[0])), key[1])) + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + + # must pass a same-length tuple with multiple keys + pytest.raises(ValueError, lambda: g.get_group('foo')) + pytest.raises(ValueError, lambda: g.get_group(('foo'))) + pytest.raises(ValueError, + lambda: g.get_group(('foo', 'bar', 'baz'))) + + def test_get_group_empty_bins(self): + + d = pd.DataFrame([3, 1, 7, 6]) + bins = [0, 5, 10, 15] + g = d.groupby(pd.cut(d[0], bins)) + + # TODO: should prob allow a str of Interval work as well + # IOW '(0, 5]' + result = g.get_group(pd.Interval(0, 5)) + expected = DataFrame([3, 1], index=[0, 1]) + assert_frame_equal(result, expected) + + pytest.raises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) + + def test_get_group_grouped_by_tuple(self): + # GH 8121 + df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T + gr = df.groupby('ids') + expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) + result = gr.get_group((1, )) + assert_frame_equal(result, expected) + + dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', + '2010-01-02']) + df = DataFrame({'ids': [(x, ) for x in dt]}) + gr = df.groupby('ids') + result = gr.get_group(('2010-01-01', )) + expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) + assert_frame_equal(result, expected) + + def test_groupby_with_empty(self): + index = pd.DatetimeIndex(()) + data = () + series = pd.Series(data, index) + grouper = pd.Grouper(freq='D') + grouped = series.groupby(grouper) + assert next(iter(grouped), None) is None + + def test_groupby_with_single_column(self): + df = pd.DataFrame({'a': list('abssbab')}) + tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + # GH 13530 + exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) + tm.assert_frame_equal(df.groupby('a').count(), exp) + tm.assert_frame_equal(df.groupby('a').sum(), exp) + tm.assert_frame_equal(df.groupby('a').nth(1), exp) + + def test_gb_key_len_equal_axis_len(self): + # GH16843 + # test ensures that index and column keys are recognized correctly + # when number of keys equals axis length of groupby + df = pd.DataFrame([['foo', 'bar', 'B', 1], + ['foo', 'bar', 'B', 2], + ['foo', 'baz', 'C', 3]], + columns=['first', 'second', 'third', 'one']) + df = df.set_index(['first', 'second']) + df = df.groupby(['first', 'second', 'third']).size() + assert df.loc[('foo', 'bar', 'B')] == 2 + assert df.loc[('foo', 'baz', 'C')] == 1 + + +# groups & iteration +# -------------------------------- + +class TestIteration(MixIn): + + def test_groups(self): + grouped = self.df.groupby(['A']) + groups = grouped.groups + 
assert groups is grouped.groups # caching works + + for k, v in compat.iteritems(grouped.groups): + assert (self.df.loc[v]['A'] == k).all() + + grouped = self.df.groupby(['A', 'B']) + groups = grouped.groups + assert groups is grouped.groups # caching works + + for k, v in compat.iteritems(grouped.groups): + assert (self.df.loc[v]['A'] == k[0]).all() + assert (self.df.loc[v]['B'] == k[1]).all() + + def test_grouping_is_iterable(self): + # this code path isn't used anywhere else + # not sure it's useful + grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year + ]) + + # test it works + for g in grouped.grouper.groupings[0]: + pass + + def test_multi_iter(self): + s = Series(np.arange(6)) + k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + + grouped = s.groupby([k1, k2]) + + iterated = list(grouped) + expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), + ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + assert_series_equal(three, e3) + + def test_multi_iter_frame(self): + k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': k1, 'k2': k2}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + + grouped = df.groupby(['k1', 'k2']) + + # things get sorted! + iterated = list(grouped) + idx = df.index + expected = [('a', '1', df.loc[idx[[4]]]), + ('a', '2', df.loc[idx[[3, 5]]]), + ('b', '1', df.loc[idx[[0, 2]]]), + ('b', '2', df.loc[idx[[1]]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + assert_frame_equal(three, e3) + + # don't iterate through groups with no data + df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) + grouped = df.groupby(['k1', 'k2']) + groups = {} + for key, gp in grouped: + groups[key] = gp + assert len(groups) == 2 + + # axis = 1 + three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + grouped = three_levels.T.groupby(axis=1, level=(1, 2)) + for key, group in grouped: + pass + + def test_multi_iter_panel(self): + with catch_warnings(record=True): + wp = tm.makePanel() + grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], + axis=1) + + for (month, wd), group in grouped: + exp_axis = [x + for x in wp.major_axis + if x.month == month and x.weekday() == wd] + expected = wp.reindex(major=exp_axis) + assert_panel_equal(group, expected) + + def test_dictify(self): + dict(iter(self.df.groupby('A'))) + dict(iter(self.df.groupby(['A', 'B']))) + dict(iter(self.df['C'].groupby(self.df['A']))) + dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) + dict(iter(self.df.groupby('A')['C'])) + dict(iter(self.df.groupby(['A', 'B'])['C'])) + + def test_groupby_with_small_elem(self): + # GH 8542 + # length=2 + df = pd.DataFrame({'event': ['start', 'start'], + 'change': [1234, 5678]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = 
grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + df = pd.DataFrame({'event': ['start', 'start', 'start'], + 'change': [1234, 5678, 9123]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', + '2014-09-15'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0, 2], :]) + res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + # length=3 + df = pd.DataFrame({'event': ['start', 'start', 'start'], + 'change': [1234, 5678, 9123]}, + index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', + '2014-08-05'])) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + assert len(grouped.groups) == 3 + assert grouped.ngroups == 3 + assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups + assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups + + res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[1], :]) + res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) + tm.assert_frame_equal(res, df.iloc[[2], :]) + + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[('A', 'a')]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + assert result == expected diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index ffbede0eb208f..501fe63137cf4 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -238,6 +238,84 @@ def test_nth_multi_index_as_expected(self): names=['A', 'B'])) assert_frame_equal(result, expected) + def test_groupby_head_tail(self): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index= False, much easier + assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) + + empty_not_as = DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) + empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + 
assert_frame_equal(empty_as, g_as.tail(-1)) + + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + def test_group_selection_cache(self): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + def test_nth_empty(): # GH 16064 diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 4b821dade6eae..c0ea968ab0819 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -474,32 +474,41 @@ def test_cython_group_transform_algos(self): np.timedelta64(5, 'ns')]) tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) - def test_cython_transform(self): + @pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) + def test_cython_transform_series(self, op, args, targop): # GH 4095 - ops = [(('cumprod', - ()), lambda x: x.cumprod()), (('cumsum', ()), - lambda x: x.cumsum()), - (('shift', (-1, )), - lambda x: x.shift(-1)), (('shift', - (1, )), lambda x: x.shift())] - s = Series(np.random.randn(1000)) s_missing = s.copy() s_missing.iloc[2:10] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) # series - for (op, args), targop in ops: - for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) - - tm.assert_series_equal(expected, - data.groupby(labels).transform(op, - *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) - + for data in [s, s_missing]: + # print(data.head()) + expected = data.groupby(labels).transform(targop) + + tm.assert_series_equal( + expected, + data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr( + data.groupby(labels), op)(*args)) + + @pytest.mark.parametrize( + "op, args, targop", + [('cumprod', (), lambda x: x.cumprod()), + ('cumsum', (), lambda x: x.cumsum()), + ('shift', (-1, ), lambda x: x.shift(-1)), + ('shift', (1, ), lambda x: x.shift())]) + def test_cython_transform_frame(self, op, args, targop): + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = 
np.random.randint(0, 50, size=1000).astype(float) strings = list('qwertyuiopasdfghjklz') strings_missing = strings[:] strings_missing[5] = np.nan @@ -530,34 +539,33 @@ def test_cython_transform(self): if op == 'shift': gb._set_group_selection() - for (op, args), targop in ops: - if op != 'shift' and 'int' not in gb_target: - # numeric apply fastpath promotes dtype so have - # to apply separately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) - expected = pd.concat([f, i], axis=1) + if op != 'shift' and 'int' not in gb_target: + # numeric apply fastpath promotes dtype so have + # to apply separately and concat + i = gb[['int']].apply(targop) + f = gb[['float', 'float_missing']].apply(targop) + expected = pd.concat([f, i], axis=1) + else: + expected = gb.apply(targop) + + expected = expected.sort_index(axis=1) + tm.assert_frame_equal(expected, + gb.transform(op, *args).sort_index( + axis=1)) + tm.assert_frame_equal(expected, getattr(gb, op)(*args)) + # individual columns + for c in df: + if c not in ['float', 'int', 'float_missing' + ] and op != 'shift': + pytest.raises(DataError, gb[c].transform, op) + pytest.raises(DataError, getattr(gb[c], op)) else: - expected = gb.apply(targop) - - expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal(expected, getattr(gb, op)(*args)) - # individual columns - for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': - pytest.raises(DataError, gb[c].transform, op) - pytest.raises(DataError, getattr(gb[c], op)) - else: - expected = gb[c].apply(targop) - expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - getattr(gb[c], op)(*args)) + expected = gb[c].apply(targop) + expected.name = c + tm.assert_series_equal(expected, + gb[c].transform(op, *args)) + tm.assert_series_equal(expected, + getattr(gb[c], op)(*args)) def test_transform_with_non_scalar_group(self): # GH 10165 diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index b70a03ec3a1d3..3d7977c63eeb6 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -1,3 +1,9 @@ +""" +these are systematically testing all of the args to value_counts +with different size combinations. 
This is to ensure stability of the sorting +and proper parameter handling +""" + import pytest from itertools import product @@ -7,55 +13,64 @@ from pandas import MultiIndex, DataFrame, Series, date_range -@pytest.mark.slow -@pytest.mark.parametrize("n,m", product((100, 1000), (5, 20))) -def test_series_groupby_value_counts(n, m): +# our starting frame +def seed_df(seed_nans, n, m): np.random.seed(1234) + days = date_range('2015-08-24', periods=10) - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - def check_value_counts(df, keys, bins): - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): + frame = DataFrame({ + '1st': np.random.choice( + list('abcd'), n), + '2nd': np.random.choice(days, n), + '3rd': np.random.randint(1, m + 1, n) + }) - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) + if seed_nans: + frame.loc[1::11, '1st'] = np.nan + frame.loc[3::17, '2nd'] = np.nan + frame.loc[7::19, '3rd'] = np.nan + frame.loc[8::19, '3rd'] = np.nan + frame.loc[9::19, '3rd'] = np.nan - gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) + return frame - gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - tm.assert_series_equal(left.sort_index(), right.sort_index()) +# create input df, keys, and the bins +binned = [] +ids = [] +for seed_nans in [True, False]: + for n, m in product((100, 1000), (5, 20)): - def loop(df): + df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) keys = '1st', '2nd', ('1st', '2nd') for k, b in product(keys, bins): - check_value_counts(df, k, b) + binned.append((df, k, b, n, m)) + ids.append("{}-{}-{}".format(k, n, m)) - days = date_range('2015-08-24', periods=10) - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) +@pytest.mark.slow +@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +def test_series_groupby_value_counts(df, keys, bins, n, m): + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + for isort, normalize, sort, ascending, dropna \ + in product((False, True), repeat=5): + + kwargs = dict(normalize=normalize, sort=sort, + ascending=ascending, dropna=dropna, bins=bins) - loop(frame) + gr = df.groupby(keys, sort=isort) + left = gr['3rd'].value_counts(**kwargs) - frame.loc[1::11, '1st'] = np.nan - frame.loc[3::17, '2nd'] = np.nan - frame.loc[7::19, '3rd'] = np.nan - frame.loc[8::19, '3rd'] = np.nan - frame.loc[9::19, '3rd'] = np.nan + gr = df.groupby(keys, sort=isort) + right = gr['3rd'].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ['3rd'] - loop(frame) + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) From 46d94163611ca8cf85879b8714326acc5dc9f2ac Mon Sep 17 00:00:00 2001 From: sfoo Date: Wed, 1 Nov 2017 06:53:18 -0400 Subject: [PATCH 09/44] BUG: DataFrame.groupby() interprets tuple as 
list of keys closes #17979 Author: sfoo Author: Jeff Reback Closes #17996 from GuessWhoSamFoo/groupby_tuples and squashes the following commits: afb00317a [Jeff Reback] TST: separate out grouping-type tests c52b2a88d [sfoo] Moved notes to 0.22; created is_axis_multiindex var - pending internal use fb52c1c97 [sfoo] Added whatsnew; checked match_axis_length 99ebc4ec2 [sfoo] Cast groupby tuple as list when multiindex --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/groupby.py | 6 ++++-- pandas/tests/groupby/test_grouping.py | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index e32b3ed268fc8..957e6d38a0f00 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -88,6 +88,7 @@ Bug Fixes ~~~~~~~~~ - Bug in ``pd.read_msgpack()`` with a non existent file is passed in Python 2 (:issue:`15296`) +- Bug in ``DataFrame.groupby`` where key as tuple in a ``MultiIndex`` were interpreted as a list of keys (:issue:`17979`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5c07033f5a68f..c94b4a0850fca 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2705,6 +2705,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, """ group_axis = obj._get_axis(axis) + is_axis_multiindex = isinstance(obj._info_axis, MultiIndex) # validate that the passed single level is compatible with the passed # axis of the object @@ -2765,7 +2766,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, elif isinstance(key, BaseGrouper): return key, [], obj - if not isinstance(key, (tuple, list)): + # when MultiIndex, allow tuple to be a key + if not isinstance(key, (tuple, list)) or \ + (isinstance(key, tuple) and is_axis_multiindex): keys = [key] match_axis_length = False else: @@ -2869,7 +2872,6 @@ def is_in_obj(gpr): # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) - return grouper, exclusions, obj diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 824c754a5d753..9e6de8749952f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -356,6 +356,25 @@ def test_multifunc_select_col_integer_cols(self): # it works! 
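A rough standalone sketch of the behaviour the new test pins down (illustrative only, not part of the patch): with ``MultiIndex`` columns, a tuple passed to ``groupby`` is now treated as a single column key instead of being expanded into a list of keys::

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
                      columns=pd.MultiIndex.from_arrays([['a', 'b', 'b', 'c'],
                                                         [1, 1, 2, 2]]))

    # groups on the values [2, 4, 4] of the single column ('b', 1)
    result = df.groupby(('b', 1)).groups
    assert sorted(result) == [2, 4]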
df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + def test_groupby_multiindex_tuple(self): + # GH 17979 + df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], + columns=pd.MultiIndex.from_arrays( + [['a', 'b', 'b', 'c'], + [1, 1, 2, 2]])) + expected = df.groupby([('b', 1)]).groups + result = df.groupby(('b', 1)).groups + tm.assert_dict_equal(expected, result) + + df2 = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], + columns=pd.MultiIndex.from_arrays( + [['a', 'b', 'b', 'c'], + ['d', 'd', 'e', 'e']])) + df2.groupby([('b', 'd')]).groups + expected = df.groupby([('b', 'd')]).groups + result = df.groupby(('b', 'd')).groups + tm.assert_dict_equal(expected, result) + @pytest.mark.parametrize('sort', [True, False]) def test_groupby_level(self, sort): # GH 17537 From c8a604ef7d793d3d2ba5759ac1f37dac47d53423 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 1 Nov 2017 07:25:33 -0400 Subject: [PATCH 10/44] CLN: some lint issues --- pandas/core/groupby.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c94b4a0850fca..967685c4e11bf 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3599,9 +3599,9 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True): offset = 0 for b in new_blocks: - l = len(b.mgr_locs) - b.mgr_locs = indexer[offset:(offset + l)] - offset += l + loc = len(b.mgr_locs) + b.mgr_locs = indexer[offset:(offset + loc)] + offset += loc return new_items, new_blocks @@ -3640,7 +3640,7 @@ def aggregate(self, arg, *args, **kwargs): result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name) - except: + except Exception: result = self._aggregate_generic(arg, *args, **kwargs) if not self.as_index: @@ -4029,7 +4029,7 @@ def _choose_path(self, fast_path, slow_path, group): if (res_r[mask] == res_fast_r[mask]).all(): path = fast_path - except: + except Exception: pass return path, res @@ -4603,7 +4603,7 @@ def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool try: starts, ends = lib.generate_slices(self.slabels, self.ngroups) - except: + except Exception: # fails when all -1 return [], True From de7a065c47901781b5010f9930087d1779cb954c Mon Sep 17 00:00:00 2001 From: Liam Marshall Date: Wed, 1 Nov 2017 07:11:49 -0500 Subject: [PATCH 11/44] read_html(): rewinding [wip] (#18017) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/io/html.py | 12 ++++++++ pandas/tests/io/test_html.py | 50 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 957e6d38a0f00..c41da4d67afe5 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -107,7 +107,7 @@ Indexing I/O ^^^ -- +- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. 
If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - - diff --git a/pandas/io/html.py b/pandas/io/html.py index 6f98683a1bff1..e1636d8007345 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -742,6 +742,18 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): try: tables = p.parse_tables() except Exception as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, 'seekable') and io.seekable(): + io.seek(0) + elif hasattr(io, 'seekable') and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError('The flavor {} failed to parse your input. ' + 'Since you passed a non-rewindable file ' + 'object, we can\'t rewind it to try ' + 'another parser. Try read_html() with a ' + 'different flavor.'.format(flav)) + retained = caught else: break diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b029403435d6f..956f3c68eeb41 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -968,3 +968,53 @@ def test_importcheck_thread_safety(): while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err + + +def test_parse_failure_unseekable(): + # Issue #17975 + _skip_if_no('lxml') + + class UnseekableStringIO(StringIO): + def seekable(self): + return False + + good = UnseekableStringIO(''' +
<table><tr><td>spam<br />eggs</td></tr></table>''') +    bad = UnseekableStringIO(''' +        <table><tr><td>spam<foobr />eggs</td></tr></table>
''') + + assert read_html(good) + assert read_html(bad, flavor='bs4') + + bad.seek(0) + + with pytest.raises(ValueError, + match='passed a non-rewindable file object'): + read_html(bad) + + +def test_parse_failure_rewinds(): + # Issue #17975 + _skip_if_no('lxml') + + class MockFile(object): + def __init__(self, data): + self.data = data + self.at_end = False + + def read(self, size=None): + data = '' if self.at_end else self.data + self.at_end = True + return data + + def seek(self, offset): + self.at_end = False + + def seekable(self): + return True + + good = MockFile('
<table><tr><td>spam<br />eggs</td></tr></table>') +    bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>
') + + assert read_html(good) + assert read_html(bad) From 7c0a3be6f01695ba06e8685e3ee91032940d07f0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Nov 2017 06:48:36 -0400 Subject: [PATCH 12/44] CI: temp disable scipy on windows 3.6 build (#18078) --- ci/requirements-3.6_WIN.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run index af7a90b126f22..5d6c074ec1f85 100644 --- a/ci/requirements-3.6_WIN.run +++ b/ci/requirements-3.6_WIN.run @@ -6,7 +6,7 @@ openpyxl xlsxwriter xlrd xlwt -scipy +# scipy feather-format numexpr pytables From 8844b2e6492ccb5829b6acda655b7c2767dbc65a Mon Sep 17 00:00:00 2001 From: Matt Braymer-Hayes Date: Thu, 2 Nov 2017 04:25:18 -0700 Subject: [PATCH 13/44] DOC: Remove duplicate 'in' from contributing.rst (#18040) (#18076) --- doc/source/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index e345f79dad5c2..1eb3a52e1b050 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -330,7 +330,7 @@ The utility script ``scripts/api_rst_coverage.py`` can be used to compare the list of methods documented in ``doc/source/api.rst`` (which is used to generate the `API Reference `_ page) and the actual public methods. -This will identify methods documented in in ``doc/source/api.rst`` that are not actually +This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. From 62695a204adeb76e08fbc3f89f43b4e9d3d731e7 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 2 Nov 2017 11:26:50 +0000 Subject: [PATCH 14/44] improve test output for Categoricals (#18069) --- doc/source/whatsnew/v0.21.1.txt | 3 ++- pandas/util/testing.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 25a891eab0e86..4adafe7c06450 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -117,7 +117,8 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) -- +- Error messages in the testing module have been improved when items have + different ``CategoricalDtype`` (:issue:`18069`) - Other diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 730d2782e85d2..dec67bbea854f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1074,8 +1074,12 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) + elif is_categorical_dtype(left): + left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) + elif is_categorical_dtype(right): + right = repr(right) msg = """{obj} are different From 769120915afc6046bc99af762a33cab46909489d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 2 Nov 2017 04:28:09 -0700 Subject: [PATCH 15/44] MAINT: Remove np.array_equal calls in tests (#18047) --- pandas/tests/dtypes/test_cast.py | 10 +++---- pandas/tests/frame/test_constructors.py | 3 +- pandas/tests/frame/test_nonunique_indexes.py | 2 +- .../indexes/datetimes/test_date_range.py | 28 +++++++++---------- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/parser/test_textreader.py | 27 ++++++++++-------- 
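The motivation for dropping bare ``np.array_equal`` assertions can be sketched informally (this snippet is illustrative, not part of the patch): ``tm.assert_numpy_array_equal`` checks the dtype and reports a readable diff on failure, while ``np.array_equal`` ignores the dtype and only returns ``False``::

    import numpy as np
    import pandas.util.testing as tm

    left = np.array([1, 2, 3], dtype=np.int64)
    right = np.array([1, 2, 3], dtype=np.float64)

    # passes even though the dtypes differ
    assert np.array_equal(left, right)

    # raises an AssertionError that points at the dtype mismatch
    try:
        tm.assert_numpy_array_equal(left, right)
    except AssertionError as err:
        print(err)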
pandas/tests/reshape/test_concat.py | 4 ++- pandas/tests/series/test_missing.py | 16 +++++++---- pandas/tests/test_algos.py | 21 ++++---------- pandas/tests/test_join.py | 11 ++++---- pandas/tests/test_lib.py | 2 +- pandas/tests/test_sorting.py | 11 ++++---- pandas/tests/tseries/test_timezones.py | 4 +-- 13 files changed, 72 insertions(+), 69 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..82a35fa711e8c 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -38,17 +38,17 @@ def test_downcast_conv(self): arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) + tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) # GH16875 coercing of bools ser = Series([True, True, False]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c55c79ef18602..8291e9d452348 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1913,10 +1913,11 @@ def test_from_records_len0_with_columns(self): # #2633 result = DataFrame.from_records([], index='foo', columns=['foo', 'bar']) + expected = Index(['bar']) - assert np.array_equal(result.columns, ['bar']) assert len(result) == 0 assert result.index.name == 'foo' + tm.assert_index_equal(result.columns, expected) def test_to_frame_with_falsey_names(self): # GH 16114 diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 4f77ba0ae1f5a..5b903c5a1eaf6 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -448,7 +448,7 @@ def test_as_matrix_duplicates(self): expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], dtype=object) - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_set_value_by_index(self): # See gh-12344 diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 3b40ef092f364..1fca0445de5c4 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -20,11 +20,6 @@ START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -def eq_gen_range(kwargs, expected): - rng = generate_range(**kwargs) - assert (np.array_equal(list(rng), expected)) - - class TestDateRanges(TestData): def test_date_range_gen_error(self): @@ -201,20 +196,23 @@ def test_generate_cday(self): assert rng1 == rng2 def test_1(self): - eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), - [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + rng = list(generate_range(start=datetime(2009, 3, 25), periods=2)) + expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)] + assert rng == expected def test_2(self): - eq_gen_range(dict(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3)), - 
[datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)]) + rng = list(generate_range(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)] + assert rng == expected def test_3(self): - eq_gen_range(dict(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6)), - []) + rng = list(generate_range(start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6))) + expected = [] + assert rng == expected def test_precision_finer_than_offset(self): # GH 9907 diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index e85d3ad294655..6a996213b28bb 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -823,7 +823,7 @@ def test_parse_integers_above_fp_precision(self): 17007000002000192, 17007000002000194]}) - assert np.array_equal(result['Numbers'], expected['Numbers']) + tm.assert_series_equal(result['Numbers'], expected['Numbers']) def test_chunks_have_consistent_numerical_type(self): integers = [str(i) for i in range(499999)] diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index c9088d2ecc5e7..f66f9ccf065f7 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -161,9 +161,9 @@ def test_skip_bad_lines(self): error_bad_lines=False, warn_bad_lines=False) result = reader.read() - expected = {0: ['a', 'd', 'g', 'l'], - 1: ['b', 'e', 'h', 'm'], - 2: ['c', 'f', 'i', 'n']} + expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object), + 1: np.array(['b', 'e', 'h', 'm'], dtype=object), + 2: np.array(['c', 'f', 'i', 'n'], dtype=object)} assert_array_dicts_equal(result, expected) reader = TextReader(StringIO(data), delimiter=':', @@ -189,8 +189,10 @@ def test_header_not_enough_lines(self): assert header == expected recs = reader.read() - expected = {0: [1, 4], 1: [2, 5], 2: [3, 6]} - assert_array_dicts_equal(expected, recs) + expected = {0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64)} + assert_array_dicts_equal(recs, expected) # not enough rows pytest.raises(parser.ParserError, TextReader, StringIO(data), @@ -203,14 +205,16 @@ def test_header_not_enough_lines_as_recarray(self): '1,2,3\n' '4,5,6') - reader = TextReader(StringIO(data), delimiter=',', header=2, - as_recarray=True) + reader = TextReader(StringIO(data), delimiter=',', + header=2, as_recarray=True) header = reader.header expected = [['a', 'b', 'c']] assert header == expected recs = reader.read() - expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]} + expected = {'a': np.array([1, 4], dtype=np.int64), + 'b': np.array([2, 5], dtype=np.int64), + 'c': np.array([3, 6], dtype=np.int64)} assert_array_dicts_equal(expected, recs) # not enough rows @@ -225,7 +229,7 @@ def test_escapechar(self): reader = TextReader(StringIO(data), delimiter=',', header=None, escapechar='\\') result = reader.read() - expected = {0: ['"hello world"'] * 3} + expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) def test_eof_has_eol(self): @@ -360,7 +364,7 @@ def test_empty_field_eof(self): result = TextReader(StringIO(data), delimiter=',').read() - expected = {0: np.array([1, 4]), + expected = {0: np.array([1, 4], dtype=np.int64), 1: np.array(['2', ''], dtype=object), 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) @@ -397,4 +401,5 @@ def test_empty_csv_input(self): def 
assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): - assert(np.array_equal(v, right[k])) + assert tm.assert_numpy_array_equal(np.asarray(v), + np.asarray(right[k])) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 75fcfaad75cef..c9c294e70e7b1 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1594,7 +1594,9 @@ def test_concat_series_axis1_same_names_ignore_index(self): s2 = Series(randn(len(dates)), index=dates, name='value') result = concat([s1, s2], axis=1, ignore_index=True) - assert np.array_equal(result.columns, [0, 1]) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) def test_concat_iterables(self): from collections import deque, Iterable diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index bd4e8b23f31b4..5ca4eba4da13b 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -636,17 +636,21 @@ def test_valid(self): def test_isna(self): ser = Series([0, 5.4, 3, nan, -0.001]) - np.array_equal(ser.isna(), - Series([False, False, False, True, False]).values) + expected = Series([False, False, False, True, False]) + tm.assert_series_equal(ser.isna(), expected) + ser = Series(["hi", "", nan]) - np.array_equal(ser.isna(), Series([False, False, True]).values) + expected = Series([False, False, True]) + tm.assert_series_equal(ser.isna(), expected) def test_notna(self): ser = Series([0, 5.4, 3, nan, -0.001]) - np.array_equal(ser.notna(), - Series([True, True, True, False, True]).values) + expected = Series([True, True, True, False, True]) + tm.assert_series_equal(ser.notna(), expected) + ser = Series(["hi", "", nan]) - np.array_equal(ser.notna(), Series([True, True, False]).values) + expected = Series([True, True, False]) + tm.assert_series_equal(ser.notna(), expected) def test_pad_nan(self): x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c', 'd'], diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index df9297312a6f3..6a5c0ae11abb7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1132,19 +1132,19 @@ def test_pad_backfill_object_segfault(): result = libalgos.pad_object(old, new) expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.pad_object(new, old) expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.backfill_object(old, new) expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.backfill_object(new, old) expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_arrmap(): @@ -1235,15 +1235,6 @@ def test_is_lexsorted(): assert (not libalgos.is_lexsorted(failure)) -# def test_get_group_index(): -# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) -# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) -# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) - -# result = lib.get_group_index([a, b], (3, 4)) - -# assert(np.array_equal(result, expected)) - def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) @@ -1253,13 +1244,13 @@ def test_groupsort_indexer(): # need to use a stable sort expected = 
np.argsort(a, kind='mergesort') - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) # compare with lexsort key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_infinity_sort(): diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index cde1cab37d09c..af946436b55c7 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -53,7 +53,7 @@ def test_left_join_indexer_unique(): result = _join.left_join_indexer_unique_int64(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_left_outer_join_bug(): @@ -69,13 +69,14 @@ def test_left_outer_join_bug(): lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) + exp_lidx = np.arange(len(left), dtype=np.int64) + exp_ridx = -np.ones(len(left), dtype=np.int64) + exp_ridx[left == 1] = 1 exp_ridx[left == 3] = 0 - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) + tm.assert_numpy_array_equal(lidx, exp_lidx) + tm.assert_numpy_array_equal(ridx, exp_ridx) def test_inner_join_indexer(): diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 2662720bb436d..75aa9aa4e8198 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -198,7 +198,7 @@ def test_get_reverse_indexer(self): indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) result = lib.get_reverse_indexer(indexer, 5) expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) class TestNAObj(object): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index a5b12bbf9608a..06c1fa1c0905a 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -332,16 +332,17 @@ def testit(label_list, shape): label_list2 = decons_group_index(group_index, shape) for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) + tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)] testit(label_list, shape) shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + label_list = [np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5)] testit(label_list, shape) diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index aa8fe90ea6500..ddcf1bb7d8b7b 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -70,7 +70,7 @@ def test_utc_to_local_no_modify(self): rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) # Values are unmodified - assert np.array_equal(rng.asi8, rng_eastern.asi8) + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) assert self.cmptz(rng_eastern.tz, self.tz('US/Eastern')) @@ -108,7 +108,7 @@ def test_localize_utc_conversion_explicit(self): rng = date_range('3/10/2012', '3/11/2012', freq='30T') converted = 
rng.tz_localize(self.tz('US/Eastern')) expected_naive = rng + offsets.Hour(5) - assert np.array_equal(converted.asi8, expected_naive.asi8) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail rng = date_range('3/11/2012', '3/12/2012', freq='30T') From edad4766c9001459f53cabf7262e386da91794b4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Nov 2017 04:56:15 -0700 Subject: [PATCH 16/44] Move scalar arithmetic tests to tests.scalars (#18075) --- .../indexes/datetimes/test_arithmetic.py | 19 --- .../indexes/timedeltas/test_arithmetic.py | 123 ------------------ pandas/tests/scalar/test_timedelta.py | 85 ++++++++++++ pandas/tests/scalar/test_timestamp.py | 18 +++ 4 files changed, 103 insertions(+), 142 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 2f3d567599fa6..bf0217e9bf22a 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -199,25 +199,6 @@ def test_ufunc_coercions(self): tm.assert_index_equal(result, exp) assert result.freq == 'D' - def test_overflow_offset(self): - # xref https://github.com/statsmodels/statsmodels/issues/3374 - # ends up multiplying really large numbers which overflow - - t = Timestamp('2017-01-13 00:00:00', freq='D') - offset = 20169940 * pd.offsets.Day(1) - - def f(): - t + offset - pytest.raises(OverflowError, f) - - def f(): - offset + t - pytest.raises(OverflowError, f) - - def f(): - t - offset - pytest.raises(OverflowError, f) - # GH 10699 @pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 9341cf2202f4c..bbc8dd6577b2c 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -51,44 +51,6 @@ def test_numeric_compat(self): pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) pytest.raises(ValueError, lambda: idx * np.array([1, 2])) - # FIXME: duplicate. This came from `test_timedelta`, whereas the - # version above came from `test_astype`. Make sure there aren't more - # duplicates. 
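For context on what the relocated ``test_overflow_offset`` covers, a rough standalone sketch (illustrative only; newer pandas versions may raise ``OutOfBoundsDatetime`` rather than a plain ``OverflowError``)::

    import pandas as pd

    stamp = pd.Timestamp('2017-01-13 00:00:00')
    offset = 20169940 * pd.offsets.Day(1)   # roughly 55,000 years

    # the nanosecond arithmetic overflows int64, so this must raise
    try:
        stamp + offset
    except (OverflowError, pd.errors.OutOfBoundsDatetime) as err:
        print(type(err).__name__, err)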
- def test_numeric_compat__(self): - - idx = self._holder(np.arange(5, dtype='int64')) - didx = self._holder(np.arange(5, dtype='int64') ** 2) - result = idx * 1 - tm.assert_index_equal(result, idx) - - result = 1 * idx - tm.assert_index_equal(result, idx) - - result = idx / 1 - tm.assert_index_equal(result, idx) - - result = idx // 1 - tm.assert_index_equal(result, idx) - - result = idx * np.array(5, dtype='int64') - tm.assert_index_equal(result, - self._holder(np.arange(5, dtype='int64') * 5)) - - result = idx * np.arange(5, dtype='int64') - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='int64')) - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5, dtype='float64') + 0.1) - tm.assert_index_equal(result, self._holder(np.arange( - 5, dtype='float64') * (np.arange(5, dtype='float64') + 0.1))) - - # invalid - pytest.raises(TypeError, lambda: idx * idx) - pytest.raises(ValueError, lambda: idx * self._holder(np.arange(3))) - pytest.raises(ValueError, lambda: idx * np.array([1, 2])) - def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], @@ -406,47 +368,6 @@ def test_addition_ops(self): expected = Timestamp('20130102') assert result == expected - # TODO: Split by op, better name - def test_ops(self): - td = Timedelta(10, unit='d') - assert -td == Timedelta(-10, unit='d') - assert +td == Timedelta(10, unit='d') - assert td - td == Timedelta(0, unit='ns') - assert (td - pd.NaT) is pd.NaT - assert td + td == Timedelta(20, unit='d') - assert (td + pd.NaT) is pd.NaT - assert td * 2 == Timedelta(20, unit='d') - assert (td * pd.NaT) is pd.NaT - assert td / 2 == Timedelta(5, unit='d') - assert td // 2 == Timedelta(5, unit='d') - assert abs(td) == td - assert abs(-td) == td - assert td / td == 1 - assert (td / pd.NaT) is np.nan - assert (td // pd.NaT) is np.nan - - # invert - assert -td == Timedelta('-10d') - assert td * -1 == Timedelta('-10d') - assert -1 * td == Timedelta('-10d') - assert abs(-td) == Timedelta('10d') - - # invalid multiply with another timedelta - pytest.raises(TypeError, lambda: td * td) - - # can't operate with integers - pytest.raises(TypeError, lambda: td + 2) - pytest.raises(TypeError, lambda: td - 2) - - def test_ops_offsets(self): - td = Timedelta(10, unit='d') - assert Timedelta(241, unit='h') == td + pd.offsets.Hour(1) - assert Timedelta(241, unit='h') == pd.offsets.Hour(1) + td - assert 240 == td / pd.offsets.Hour(1) - assert 1 / 240.0 == pd.offsets.Hour(1) / td - assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) - assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td - def test_ops_ndarray(self): td = Timedelta('1 day') @@ -530,50 +451,6 @@ def test_ops_series_object(self): tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp) tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) - def test_ops_notimplemented(self): - class Other: - pass - - other = Other() - - td = Timedelta('1 day') - assert td.__add__(other) is NotImplemented - assert td.__sub__(other) is NotImplemented - assert td.__truediv__(other) is NotImplemented - assert td.__mul__(other) is NotImplemented - assert td.__floordiv__(other) is NotImplemented - - def test_timedelta_ops_scalar(self): - # GH 6808 - base = pd.to_datetime('20130101 09:01:12.123456') - expected_add = pd.to_datetime('20130101 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta(10, unit='s'), 
timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - - base = pd.to_datetime('20130102 09:01:12.123456') - expected_add = pd.to_datetime('20130103 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta('1 day, 00:00:10'), - pd.to_timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - def test_timedelta_ops_with_missing_values(self): # setup s1 = pd.to_timedelta(Series(['00:00:01'])) diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index d4434b3af385b..17c818779c76d 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -40,6 +40,91 @@ def test_to_timedelta_on_nanoseconds(self): pytest.raises(TypeError, lambda: Timedelta(nanoseconds='abc')) + def test_ops_notimplemented(self): + class Other: + pass + + other = Other() + + td = Timedelta('1 day') + assert td.__add__(other) is NotImplemented + assert td.__sub__(other) is NotImplemented + assert td.__truediv__(other) is NotImplemented + assert td.__mul__(other) is NotImplemented + assert td.__floordiv__(other) is NotImplemented + + def test_timedelta_ops_scalar(self): + # GH 6808 + base = pd.to_datetime('20130101 09:01:12.123456') + expected_add = pd.to_datetime('20130101 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), + np.timedelta64(10, 's'), + np.timedelta64(10000000000, 'ns'), + pd.offsets.Second(10)]: + result = base + offset + assert result == expected_add + + result = base - offset + assert result == expected_sub + + base = pd.to_datetime('20130102 09:01:12.123456') + expected_add = pd.to_datetime('20130103 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta('1 day, 00:00:10'), + pd.to_timedelta('1 days, 00:00:10'), + timedelta(days=1, seconds=10), + np.timedelta64(1, 'D') + np.timedelta64(10, 's'), + pd.offsets.Day() + pd.offsets.Second(10)]: + result = base + offset + assert result == expected_add + + result = base - offset + assert result == expected_sub + + def test_ops_offsets(self): + td = Timedelta(10, unit='d') + assert Timedelta(241, unit='h') == td + pd.offsets.Hour(1) + assert Timedelta(241, unit='h') == pd.offsets.Hour(1) + td + assert 240 == td / pd.offsets.Hour(1) + assert 1 / 240.0 == pd.offsets.Hour(1) / td + assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) + assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td + + # TODO: Split by op, better name + def test_ops(self): + td = Timedelta(10, unit='d') + assert -td == Timedelta(-10, unit='d') + assert +td == Timedelta(10, unit='d') + assert td - td == Timedelta(0, unit='ns') + assert (td - pd.NaT) is pd.NaT + assert td + td == Timedelta(20, unit='d') + assert (td + pd.NaT) is pd.NaT + assert td * 2 == Timedelta(20, unit='d') + assert (td * pd.NaT) is pd.NaT + assert td / 2 == Timedelta(5, unit='d') + assert td // 2 == Timedelta(5, unit='d') + assert abs(td) == td + assert abs(-td) == td + assert td / td == 1 + assert (td / pd.NaT) is np.nan + assert (td // 
pd.NaT) is np.nan + + # invert + assert -td == Timedelta('-10d') + assert td * -1 == Timedelta('-10d') + assert -1 * td == Timedelta('-10d') + assert abs(-td) == Timedelta('10d') + + # invalid multiply with another timedelta + pytest.raises(TypeError, lambda: td * td) + + # can't operate with integers + pytest.raises(TypeError, lambda: td + 2) + pytest.raises(TypeError, lambda: td - 2) + class TestTimedeltas(object): _multiprocess_can_split_ = True diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index efee096797510..4cd9a2fadeb32 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -28,6 +28,24 @@ RESO_MS, RESO_SEC) +class TestTimestampArithmetic(object): + def test_overflow_offset(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + # ends up multiplying really large numbers which overflow + + stamp = Timestamp('2017-01-13 00:00:00', freq='D') + offset = 20169940 * offsets.Day(1) + + with pytest.raises(OverflowError): + stamp + offset + + with pytest.raises(OverflowError): + offset + stamp + + with pytest.raises(OverflowError): + stamp - offset + + class TestTimestamp(object): def test_constructor(self): From bd958a17d7b3260ed7186ad65e62097ffd817893 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Nov 2017 06:58:52 -0500 Subject: [PATCH 17/44] Update Contributing Environment section (#18052) --- ci/environment-dev.yaml | 14 + ...ll.txt => requirements-optional-conda.txt} | 45 ++- ci/requirements-optional-pip.txt | 27 ++ ci/requirements_dev.txt | 14 +- doc/source/contributing.rst | 282 ++++++++---------- scripts/convert_deps.py | 29 ++ 6 files changed, 232 insertions(+), 179 deletions(-) create mode 100644 ci/environment-dev.yaml rename ci/{requirements_all.txt => requirements-optional-conda.txt} (68%) create mode 100644 ci/requirements-optional-pip.txt create mode 100644 scripts/convert_deps.py diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml new file mode 100644 index 0000000000000..c3d3d59f895c6 --- /dev/null +++ b/ci/environment-dev.yaml @@ -0,0 +1,14 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - Cython + - NumPy + - moto + - pytest + - python-dateutil + - python=3 + - pytz + - setuptools + - sphinx diff --git a/ci/requirements_all.txt b/ci/requirements-optional-conda.txt similarity index 68% rename from ci/requirements_all.txt rename to ci/requirements-optional-conda.txt index e13afd619f105..6edb8d17337e4 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements-optional-conda.txt @@ -1,28 +1,27 @@ -pytest>=3.1.0 -pytest-cov -pytest-xdist -flake8 -sphinx=1.5* -nbsphinx -ipython -python-dateutil -pytz -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy beautifulsoup4 -numpy -cython -scipy +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +ipykernel +jinja2 +lxml +matplotlib +nbsphinx numexpr +openpyxl +pyarrow +pymysql pytables -matplotlib +pytest-cov +pytest-xdist +s3fs +scipy seaborn -lxml sqlalchemy -bottleneck -pymysql -Jinja2 +xarray +xlrd +xlsxwriter +xlwt diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt new file mode 100644 index 0000000000000..06b22bd8f2c63 --- /dev/null +++ b/ci/requirements-optional-pip.txt @@ -0,0 +1,27 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directlybeautifulsoup4 +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +jinja2 +lxml +matplotlib +nbsphinx +numexpr +openpyxl +pyarrow 
+pymysql +tables +pytest-cov +pytest-xdist +s3fs +scipy +seaborn +sqlalchemy +xarray +xlrd +xlsxwriter +xlwt \ No newline at end of file diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index dbc4f6cbd6509..2fb36b7cd70d8 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -1,8 +1,10 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directly +Cython +NumPy +moto +pytest python-dateutil pytz -numpy -cython -pytest>=3.1.0 -pytest-cov -flake8 -moto +setuptools +sphinx \ No newline at end of file diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 1eb3a52e1b050..2a1aa3d0cf17a 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -11,32 +11,32 @@ Where to start? =============== All contributions, bug reports, bug fixes, documentation improvements, -enhancements and ideas are welcome. +enhancements, and ideas are welcome. -If you are simply looking to start working with the *pandas* codebase, navigate to the -`GitHub "issues" tab `_ and start looking through -interesting issues. There are a number of issues listed under `Docs +If you are brand new to pandas or open-source development, we recommend going +through the `GitHub "issues" tab `_ +to find issues that interest you. There are a number of issues listed under `Docs `_ and `Difficulty Novice `_ -where you could start out. - -Or maybe through using *pandas* you have an idea of your own or are looking for something -in the documentation and thinking 'this can be improved'...you can do something -about it! +where you could start out. Once you've found an interesting issue, you can +return here to get your development environment setup. Feel free to ask questions on the `mailing list -`_ or on `Gitter -`_. +`_ or on `Gitter`_. + +.. _contributing.bug_reports: Bug reports and enhancement requests ==================================== -Bug reports are an important part of making *pandas* more stable. Having a complete bug report -will allow others to reproduce the bug and provide insight into fixing. Because many versions of -*pandas* are supported, knowing version information will also identify improvements made since -previous versions. Trying the bug-producing code out on the *master* branch is often a worthwhile exercise -to confirm the bug still exists. It is also worth searching existing bug reports and pull requests +Bug reports are an important part of making *pandas* more stable. Having a complete bug report +will allow others to reproduce the bug and provide insight into fixing. See +`this stackoverflow article `_ for tips on +writing a good bug report. + +Trying the bug-producing code out on the *master* branch is often a worthwhile exercise +to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed. Bug reports must: @@ -60,12 +60,16 @@ Bug reports must: The issue will then show up to the *pandas* community and be open to comments/ideas from others. +.. _contributing.github + Working with the code ===================== Now that you have an issue you want to fix, enhancement to add, or documentation to improve, you need to learn how to work with GitHub and the *pandas* code base. +.. 
_contributing.version_control: + Version control, Git, and GitHub -------------------------------- @@ -103,167 +107,164 @@ want to clone your fork to your machine:: git clone https://github.com/your-user-name/pandas.git pandas-yourname cd pandas-yourname - git remote add upstream git://github.com/pandas-dev/pandas.git + git remote add upstream https://github.com/pandas-dev/pandas.git This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. -Creating a branch ------------------ +.. _contributing.dev_env: -You want your master branch to reflect only production-ready code, so create a -feature branch for making your changes. For example:: +Creating a development environment +---------------------------------- - git branch shiny-new-feature - git checkout shiny-new-feature +To test out code changes, you'll need to build pandas from source, which +requires a C compiler and python environment. If you're making documentation +changes, you can skip to :ref:`contributing.documentation` but you won't be able +to build the documentation locally before pushing your changes. -The above can be simplified to:: +.. _contributiong.dev_c: - git checkout -b shiny-new-feature +Installing a C Complier +~~~~~~~~~~~~~~~~~~~~~~~ -This changes your working directory to the shiny-new-feature branch. Keep any -changes in this branch specific to one bug or feature so it is clear -what the branch brings to *pandas*. You can have many shiny-new-features -and switch in between them using the git checkout command. +Pandas uses C extensions (mostly written using Cython) to speed up certain +operations. To install pandas from source, you need to compile these C +extensions, which means you need a C complier. This process depends on which +platform you're using. Follow the `CPython contributing guidelines +`_ for getting a +complier installed. You don't need to do any of the ``./configure`` or ``make`` +steps; you only need to install the complier. -To update this branch, you need to retrieve the changes from the master branch:: +For Windows developers, the following links may be helpful. - git fetch upstream - git rebase upstream/master +- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ +- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit +- https://cowboyprogrammer.org/building-python-wheels-for-windows/ +- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ +- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy -This will replay your commits on top of the latest pandas git master. If this -leads to merge conflicts, you must resolve these before submitting your pull -request. If you have uncommitted changes, you will need to ``stash`` them prior -to updating. This will effectively store your changes and they can be reapplied -after updating. +Let us know if you have any difficulties by opening an issue or reaching out on +`Gitter`_. -.. _contributing.dev_env: +.. _contributiong.dev_python: -Creating a development environment ----------------------------------- +Creating a Python Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -An easy way to create a *pandas* development environment is as follows. 
+Now that you have a C complier, create an isolated pandas development +environment: -- Install either :ref:`Anaconda ` or :ref:`miniconda ` +- Install either `Anaconda `_ or `miniconda + `_ +- Make sure your conda is up to date (``conda update conda``) - Make sure that you have :ref:`cloned the repository ` - ``cd`` to the *pandas* source directory -Tell conda to create a new environment, named ``pandas_dev``, or any other name you would like -for this environment, by running:: - - conda create -n pandas_dev --file ci/requirements_dev.txt - - -For a python 3 environment:: - - conda create -n pandas_dev python=3 --file ci/requirements_dev.txt - -.. warning:: - - If you are on Windows, see :ref:`here for a fully compliant Windows environment `. - -This will create the new environment, and not touch any of your existing environments, -nor any existing python installation. It will install all of the basic dependencies of -*pandas*, as well as the development and testing tools. If you would like to install -other dependencies, you can install them as follows:: +We'll now kick off a three-step process: - conda install -n pandas_dev -c pandas pytables scipy +1. Install the build dependencies +2. Build and install pandas +3. Install the optional dependencies -To install *all* pandas dependencies you can do the following:: +.. code-block:: none - conda install -n pandas_dev -c conda-forge --file ci/requirements_all.txt + # Create and activate the build environment + conda env create -f ci/environment-dev.yaml + conda activate pandas-dev -To work in this environment, Windows users should ``activate`` it as follows:: + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e . - activate pandas_dev + # Install the rest of the optional dependencies + conda install -c defaults -c conda-forge --file=ci/requirements-optional-conda.txt -Mac OSX / Linux users should use:: +At this point you should be able to import pandas from your locally built version:: - source activate pandas_dev + $ python # start an interpreter + >>> import pandas + >>> print(pandas.__version__) + 0.22.0.dev0+29.g4ad6d4d74 -You will then see a confirmation message to indicate you are in the new development environment. +This will create the new environment, and not touch any of your existing environments, +nor any existing python installation. To view your environments:: conda info -e -To return to your home root environment in Windows:: - - deactivate +To return to your root environment:: -To return to your home root environment in OSX / Linux:: - - source deactivate + conda deactivate See the full conda docs `here `__. -At this point you can easily do an *in-place* install, as detailed in the next section. - -.. _contributing.windows: - -Creating a Windows development environment ------------------------------------------- +.. _contributing.pip: -To build on Windows, you need to have compilers installed to build the extensions. You will need to install the appropriate Visual Studio compilers, VS 2008 for Python 2.7, VS 2010 for 3.4, and VS 2015 for Python 3.5 and 3.6. +Creating a Python Environment (pip) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For Python 2.7, you can install the ``mingw`` compiler which will work equivalently to VS 2008:: +If you aren't using conda for you development environment, follow these instructions. +You'll need to have at least python3.5 installed on your system. - conda install -n pandas_dev libpython +.. 
code-block:: none -or use the `Microsoft Visual Studio VC++ compiler for Python `__. Note that you have to check the ``x64`` box to install the ``x64`` extension building capability as this is not installed by default. + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev + # Any parent directories should already exist + python3 -m venv ~/virtualenvs/pandas-dev + # Activate the virtulaenv + . ~/virtualenvs/pandas-dev/bin/activate -For Python 3.4, you can download and install the `Windows 7.1 SDK `__. Read the references below as there may be various gotchas during the installation. - -For Python 3.5 and 3.6, you can download and install the `Visual Studio 2015 Community Edition `__. - -Here are some references and blogs: - -- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -- https://cowboyprogrammer.org/building-python-wheels-for-windows/ -- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy + # Install the build dependencies + python -m pip install -r ci/requirements_dev.txt + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e . -.. _contributing.getting_source: + # Install additional dependencies + python -m pip install -r ci/requirements-optional-pip.txt -Making changes --------------- +Creating a branch +----------------- -Before making your code changes, it is often necessary to build the code that was -just checked out. There are two primary methods of doing this. +You want your master branch to reflect only production-ready code, so create a +feature branch for making your changes. For example:: -#. The best way to develop *pandas* is to build the C extensions in-place by - running:: + git branch shiny-new-feature + git checkout shiny-new-feature - python setup.py build_ext --inplace +The above can be simplified to:: - If you startup the Python interpreter in the *pandas* source directory you - will call the built C extensions + git checkout -b shiny-new-feature -#. Another very common option is to do a ``develop`` install of *pandas*:: +This changes your working directory to the shiny-new-feature branch. Keep any +changes in this branch specific to one bug or feature so it is clear +what the branch brings to *pandas*. You can have many shiny-new-features +and switch in between them using the git checkout command. - python setup.py develop +To update this branch, you need to retrieve the changes from the master branch:: - This makes a symbolic link that tells the Python interpreter to import *pandas* - from your development directory. Thus, you can always be using the development - version on your system without being inside the clone directory. + git fetch upstream + git rebase upstream/master +This will replay your commits on top of the latest pandas git master. If this +leads to merge conflicts, you must resolve these before submitting your pull +request. If you have uncommitted changes, you will need to ``stash`` them prior +to updating. This will effectively store your changes and they can be reapplied +after updating. .. _contributing.documentation: Contributing to the documentation ================================= -If you're not the developer type, contributing to the documentation is still -of huge value. 
You don't even have to be an expert on -*pandas* to do so! Something as simple as rewriting small passages for clarity -as you reference the docs is a simple but effective way to contribute. The -next person to read that passage will be in your debt! - -In fact, there are sections of the docs that are worse off after being written -by experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a simple way to ensure it will -help the next person. +If you're not the developer type, contributing to the documentation is still of +huge value. You don't even have to be an expert on *pandas* to do so! In fact, +there are sections of the docs that are worse off after being written by +experts. If something in the docs doesn't make sense to you, updating the +relevant section after you figure it out is a great way to ensure it will help +the next person. .. contents:: Documentation: :local: @@ -342,30 +343,6 @@ Requirements First, you need to have a development environment to be able to build pandas (see the docs on :ref:`creating a development environment above `). -Further, to build the docs, there are some extra requirements: you will need to -have ``sphinx`` and ``ipython`` installed. `numpydoc -`_ is used to parse the docstrings that -follow the Numpy Docstring Standard (see above), but you don't need to install -this because a local copy of numpydoc is included in the *pandas* source -code. `nbsphinx `_ is required to build -the Jupyter notebooks included in the documentation. - -If you have a conda environment named ``pandas_dev``, you can install the extra -requirements with:: - - conda install -n pandas_dev sphinx ipython nbconvert nbformat - conda install -n pandas_dev -c conda-forge nbsphinx - -Furthermore, it is recommended to have all :ref:`optional dependencies `. -installed. This is not strictly necessary, but be aware that you will see some error -messages when building the docs. This happens because all the code in the documentation -is executed during the doc build, and so code examples using optional dependencies -will generate errors. Run ``pd.show_versions()`` to get an overview of the installed -version of all dependencies. - -.. warning:: - - You need to have ``sphinx`` version >= 1.3.2. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -386,10 +363,10 @@ If you want to do a full clean build, do:: python make.py clean python make.py html -Starting with *pandas* 0.13.1 you can tell ``make.py`` to compile only a single section -of the docs, greatly reducing the turn-around time for checking your changes. -You will be prompted to delete ``.rst`` files that aren't required. This is okay because -the prior versions of these files can be checked out from git. However, you must make sure +You can tell ``make.py`` to compile only a single section of the docs, greatly +reducing the turn-around time for checking your changes. You will be prompted to +delete ``.rst`` files that aren't required. This is okay because the prior +versions of these files can be checked out from git. However, you must make sure not to commit the file deletions to your Git repository! :: @@ -422,6 +399,8 @@ the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :ref:`Continuous Integration ` section. +.. 
_contributing.code: + Contributing to the code base ============================= @@ -480,7 +459,7 @@ Once configured, you can run the tool as follows:: clang-format modified-c-file This will output what your file will look like if the changes are made, and to apply -them, just run the following command:: +them, run the following command:: clang-format -i modified-c-file @@ -1033,7 +1012,7 @@ delete your branch:: git checkout master git merge upstream/master -Then you can just do:: +Then you can do:: git branch -d shiny-new-feature @@ -1043,3 +1022,6 @@ branch has not actually been merged. The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature + + +.. _Gitter: https://gitter.im/pydata/pandas diff --git a/scripts/convert_deps.py b/scripts/convert_deps.py new file mode 100644 index 0000000000000..aabeb24a0c3c8 --- /dev/null +++ b/scripts/convert_deps.py @@ -0,0 +1,29 @@ +""" +Convert the conda environment.yaml to a pip requirements.txt +""" +import yaml + +exclude = {'python=3'} +rename = {'pytables': 'tables'} + +with open("ci/environment-dev.yaml") as f: + dev = yaml.load(f) + +with open("ci/requirements-optional-conda.txt") as f: + optional = [x.strip() for x in f.readlines()] + +required = dev['dependencies'] +required = [rename.get(dep, dep) for dep in required if dep not in exclude] +optional = [rename.get(dep, dep) for dep in optional if dep not in exclude] + + +with open("ci/requirements_dev.txt", 'wt') as f: + f.write("# This file was autogenerated by scripts/convert_deps.py\n") + f.write("# Do not modify directly\n") + f.write('\n'.join(required)) + + +with open("ci/requirements-optional-pip.txt", 'wt') as f: + f.write("# This file was autogenerated by scripts/convert_deps.py\n") + f.write("# Do not modify directly\n") + f.write("\n".join(optional)) From ef9a06c4bebebe6b294819782b66b0b24f85690f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Nov 2017 15:49:22 -0700 Subject: [PATCH 18/44] Index tests in the wrong places (#18074) --- pandas/tests/indexes/datetimes/test_astype.py | 33 +++- .../tests/indexes/datetimes/test_datetime.py | 83 +-------- pandas/tests/indexes/datetimes/test_ops.py | 43 ----- .../indexes/datetimes/test_partial_slicing.py | 62 +++++++ pandas/tests/indexes/datetimes/test_setops.py | 161 +++++++++++------- .../timedeltas/test_partial_slicing.py | 4 + .../indexes/timedeltas/test_timedelta.py | 6 - 7 files changed, 191 insertions(+), 201 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 46be24b90faae..0197fc4c52617 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -117,6 +117,15 @@ def test_astype_datetime64(self): dtype='datetime64[ns]') tm.assert_index_equal(result, expected) + def test_astype_object(self): + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + assert casted.tolist() == exp_values + def test_astype_raises(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -287,12 +296,18 @@ def test_to_period_tz_dateutil(self): assert result == expected tm.assert_index_equal(ts.to_period(), xp) - def test_astype_object(self): - # NumPy 1.6.1 weak ns support - rng = date_range('1/1/2000', periods=20) - - casted = rng.astype('O') - exp_values = list(rng) - - tm.assert_index_equal(casted, Index(exp_values, 
dtype=np.object_)) - assert casted.tolist() == exp_values + def test_to_period_nofreq(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + pytest.raises(ValueError, idx.to_period) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], + freq='infer') + assert idx.freqstr == 'D' + expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', + '2000-01-03'], freq='D') + tm.assert_index_equal(idx.to_period(), expected) + + # GH 7606 + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + assert idx.freqstr is None + tm.assert_index_equal(idx.to_period(), expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 88bf8a4024112..cc6eeb44c99c9 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -8,10 +8,10 @@ import pandas.util.testing as tm from pandas.compat import lrange from pandas.compat.numpy import np_datetime64_compat -from pandas import (DatetimeIndex, Index, date_range, Series, DataFrame, +from pandas import (DatetimeIndex, Index, date_range, DataFrame, Timestamp, datetime, offsets) -from pandas.util.testing import assert_series_equal, assert_almost_equal +from pandas.util.testing import assert_almost_equal randn = np.random.randn @@ -223,22 +223,6 @@ def test_append_join_nondatetimeindex(self): # it works rng.join(idx, how='outer') - def test_to_period_nofreq(self): - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) - pytest.raises(ValueError, idx.to_period) - - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], - freq='infer') - assert idx.freqstr == 'D' - expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', - '2000-01-03'], freq='D') - tm.assert_index_equal(idx.to_period(), expected) - - # GH 7606 - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) - assert idx.freqstr is None - tm.assert_index_equal(idx.to_period(), expected) - def test_comparisons_coverage(self): rng = date_range('1/1/2000', periods=10) @@ -567,13 +551,6 @@ def test_does_not_convert_mixed_integer(self): assert cols.dtype == joined.dtype tm.assert_numpy_array_equal(cols.values, joined.values) - def test_slice_keeps_name(self): - # GH4226 - st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') - et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') - dr = pd.date_range(st, et, freq='H', name='timebucket') - assert dr[1:].name == dr.name - def test_join_self(self): index = date_range('1/1/2000', periods=10) kinds = 'outer', 'inner', 'left', 'right' @@ -687,59 +664,3 @@ def test_factorize_dst(self): arr, res = obj.factorize() tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) - - def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - - assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) - - assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) - - assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( - 
'2014-10-01'):-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], - SLC[13:8:-1]) - - assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) - - def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - tm.assert_raises_regex(ValueError, 'slice step cannot be zero', - lambda: ts[::0]) - tm.assert_raises_regex(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - tm.assert_raises_regex(ValueError, 'slice step cannot be zero', - lambda: ts.loc[::0]) - - def test_slice_bounds_empty(self): - # GH 14354 - empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') - - right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') - exp = Timestamp('2015-01-02 23:59:59.999999999') - assert right == exp - - left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') - exp = Timestamp('2015-01-02 00:00:00') - assert left == exp - - def test_slice_duplicate_monotonic(self): - # https://github.com/pandas-dev/pandas/issues/16515 - idx = pd.DatetimeIndex(['2017', '2017']) - result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') - expected = Timestamp('2017-01-01') - assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 14217ae291a4c..6e66e4a36f905 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -383,49 +383,6 @@ def test_resolution(self): tz=tz) assert idx.resolution == expected - def test_union(self): - for tz in self.tz: - # union - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: - - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) - - def test_difference(self): - for tz in self.tz: - # diff - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: - result_diff = rng.difference(other) - tm.assert_index_equal(result_diff, expected) - def test_comp_nat(self): left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py 
b/pandas/tests/indexes/datetimes/test_partial_slicing.py index e7d03aa193cbd..50ee88bd82f40 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -12,6 +12,68 @@ class TestSlicing(object): + def test_slice_keeps_name(self): + # GH4226 + st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') + et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') + dr = pd.date_range(st, et, freq='H', name='timebucket') + assert dr[1:].name == dr.name + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) + assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( + '2014-10-01'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], + SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], + SLC[13:8:-1]) + + assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + tm.assert_raises_regex(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + tm.assert_raises_regex(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + tm.assert_raises_regex(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + + def test_slice_bounds_empty(self): + # GH 14354 + empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') + + right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') + exp = Timestamp('2015-01-02 23:59:59.999999999') + assert right == exp + + left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') + exp = Timestamp('2015-01-02 00:00:00') + assert left == exp + + def test_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = pd.DatetimeIndex(['2017', '2017']) + result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') + expected = Timestamp('2017-01-01') + assert result == expected def test_slice_year(self): dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index ff436e0501849..5df75338d01d7 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,5 +1,6 @@ from datetime import datetime +import pytest import numpy as np import pandas as pd @@ -11,14 +12,30 @@ START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -class TestDatetimeIndex(object): +class TestDatetimeIndexSetOps(object): + tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', + 'dateutil/US/Pacific'] - def test_union(self): - i1 = Int64Index(np.arange(0, 20, 2)) - i2 = Int64Index(np.arange(10, 30, 2)) - result = i1.union(i2) - expected = Int64Index(np.arange(0, 30, 
2)) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("tz", tz) + def test_union(self, tz): + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3)]: + + result_union = rng.union(other) + tm.assert_index_equal(result_union, expected) def test_union_coverage(self): idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) @@ -83,62 +100,62 @@ def test_union_with_DatetimeIndex(self): i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" - def test_intersection(self): + @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific']) + def test_intersection(self, tz): # GH 4690 (with tz) - for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') - - # if target has the same name, it is preserved - rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') - expected2 = date_range('6/1/2000', '6/20/2000', freq='D', - name='idx') - - # if target name is different, it will be reset - rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') - expected3 = date_range('6/1/2000', '6/20/2000', freq='D', - name=None) - - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = DatetimeIndex([], name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: - result = base.intersection(rng) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - # non-monotonic - base = DatetimeIndex(['2011-01-05', '2011-01-04', - '2011-01-02', '2011-01-03'], - tz=tz, name='idx') - - rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='idx') - expected2 = DatetimeIndex( - ['2011-01-04', '2011-01-02'], tz=tz, name='idx') - - rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='other') - expected3 = DatetimeIndex( - ['2011-01-04', '2011-01-02'], tz=tz, name=None) - - # GH 7880 - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, - name='idx') - expected4 = DatetimeIndex([], tz=tz, name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: - result = base.intersection(rng) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq is None - assert result.tz == expected.tz - + base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + + # if target has the same name, it is preserved + rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') + expected2 = date_range('6/1/2000', '6/20/2000', freq='D', name='idx') + + # if target name is different, it will be reset + rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') + expected3 = date_range('6/1/2000', '6/20/2000', 
freq='D', name=None) + + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') + expected4 = DatetimeIndex([], name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # non-monotonic + base = DatetimeIndex(['2011-01-05', '2011-01-04', + '2011-01-02', '2011-01-03'], + tz=tz, name='idx') + + rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', + '2011-02-02', '2011-02-03'], + tz=tz, name='idx') + expected2 = DatetimeIndex(['2011-01-04', '2011-01-02'], + tz=tz, name='idx') + + rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', + '2011-02-02', '2011-02-03'], + tz=tz, name='other') + expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'], + tz=tz, name=None) + + # GH 7880 + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, + name='idx') + expected4 = DatetimeIndex([], tz=tz, name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), + (rng4, expected4)]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq is None + assert result.tz == expected.tz + + def test_intersection_empty(self): # empty same freq GH2129 rng = date_range('6/1/2000', '6/15/2000', freq='T') result = rng[0:0].intersection(rng) @@ -155,6 +172,26 @@ def test_intersection_bug_1708(self): result = index_1 & index_2 assert len(result) == 0 + @pytest.mark.parametrize("tz", tz) + def test_difference(self, tz): + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3)]: + result_diff = rng.difference(other) + tm.assert_index_equal(result_diff, expected) + def test_difference_freq(self): # GH14323: difference of DatetimeIndex should not preserve frequency diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 8e5eae2a7a3ef..7c5f82193da6d 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -9,6 +9,10 @@ class TestSlicing(object): + def test_slice_keeps_name(self): + # GH4226 + dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket') + assert dr[1:].name == dr.name def test_partial_slice(self): rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 2683110f2f02e..615c0d0ffa210 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -233,12 +233,6 @@ def test_join_self(self): joined = index.join(index, how=kind) tm.assert_index_equal(index, joined) - def test_slice_keeps_name(self): - - # GH4226 - dr = 
pd.timedelta_range('1d', '5d', freq='H', name='timebucket') - assert dr[1:].name == dr.name - def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: randn(), From ba279c011580b0fb363fe79f65fe9f31a1c28ad6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Nov 2017 16:49:54 -0700 Subject: [PATCH 19/44] Move comparison utilities to np_datetime; (#18080) --- pandas/_libs/tslib.pyx | 35 +++++------------------------ pandas/_libs/tslibs/np_datetime.pxd | 4 ++++ pandas/_libs/tslibs/np_datetime.pyx | 31 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d2492064c900c..8a882a465f9f7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -55,6 +55,8 @@ from datetime cimport ( from datetime import time as datetime_time from tslibs.np_datetime cimport (check_dts_bounds, + reverse_ops, + cmp_scalar, pandas_datetimestruct, dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64) @@ -893,31 +895,6 @@ def unique_deltas(ndarray[int64_t] arr): return result -cdef inline bint _cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: - if op == Py_EQ: - return lhs == rhs - elif op == Py_NE: - return lhs != rhs - elif op == Py_LT: - return lhs < rhs - elif op == Py_LE: - return lhs <= rhs - elif op == Py_GT: - return lhs > rhs - elif op == Py_GE: - return lhs >= rhs - - -cdef int _reverse_ops[6] - -_reverse_ops[Py_LT] = Py_GT -_reverse_ops[Py_LE] = Py_GE -_reverse_ops[Py_EQ] = Py_EQ -_reverse_ops[Py_NE] = Py_NE -_reverse_ops[Py_GT] = Py_LT -_reverse_ops[Py_GE] = Py_LE - - cdef str _NDIM_STRING = "ndim" # This is PITA. Because we inherit from datetime, which has very specific @@ -970,7 +947,7 @@ cdef class _Timestamp(datetime): raise TypeError('Cannot compare type %r with type %r' % (type(self).__name__, type(other).__name__)) - return PyObject_RichCompare(other, self, _reverse_ops[op]) + return PyObject_RichCompare(other, self, reverse_ops[op]) else: if op == Py_EQ: return False @@ -980,7 +957,7 @@ cdef class _Timestamp(datetime): (type(self).__name__, type(other).__name__)) self._assert_tzawareness_compat(other) - return _cmp_scalar(self.value, ots.value, op) + return cmp_scalar(self.value, ots.value, op) def __reduce_ex__(self, protocol): # python 3.6 compat @@ -2066,7 +2043,7 @@ cdef class _Timedelta(timedelta): type(other).__name__)) if util.is_array(other): return PyObject_RichCompare(np.array([self]), other, op) - return PyObject_RichCompare(other, self, _reverse_ops[op]) + return PyObject_RichCompare(other, self, reverse_ops[op]) else: if op == Py_EQ: return False @@ -2075,7 +2052,7 @@ cdef class _Timedelta(timedelta): raise TypeError('Cannot compare type %r with type %r' % (type(self).__name__, type(other).__name__)) - return _cmp_scalar(self.value, ots.value, op) + return cmp_scalar(self.value, ots.value, op) def _ensure_components(_Timedelta self): """ diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 0e6eda0c88beb..ab77049a9ff5b 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -12,6 +12,10 @@ cdef extern from "../src/datetime/np_datetime.h": int32_t month, day, hour, min, sec, us, ps, as +cdef int reverse_ops[6] + +cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1 + cdef check_dts_bounds(pandas_datetimestruct *dts) cdef int64_t dtstruct_to_dt64(pandas_datetimestruct* dts) nogil diff --git 
a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 217cde2aad677..1c635e6cecc13 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- # cython: profile=False +from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE + from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT, PyDateTime_GET_YEAR, PyDateTime_GET_MONTH, @@ -47,6 +49,35 @@ cdef extern from "../src/datetime/np_datetime.h": pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS # ---------------------------------------------------------------------- +# Comparison + +cdef int reverse_ops[6] + +reverse_ops[Py_LT] = Py_GT +reverse_ops[Py_LE] = Py_GE +reverse_ops[Py_EQ] = Py_EQ +reverse_ops[Py_NE] = Py_NE +reverse_ops[Py_GT] = Py_LT +reverse_ops[Py_GE] = Py_LE + + +cdef inline bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: + """ + cmp_scalar is a more performant version of PyObject_RichCompare + typed for int64_t arguments. + """ + if op == Py_EQ: + return lhs == rhs + elif op == Py_NE: + return lhs != rhs + elif op == Py_LT: + return lhs < rhs + elif op == Py_LE: + return lhs <= rhs + elif op == Py_GT: + return lhs > rhs + elif op == Py_GE: + return lhs >= rhs class OutOfBoundsDatetime(ValueError): From 2a31f7b759140f936c665a6ba88bc496e1d31edd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Nov 2017 16:54:44 -0700 Subject: [PATCH 20/44] Separate _TSObject into conversion (#18060) --- pandas/_libs/tslib.pxd | 3 +- pandas/_libs/tslib.pyx | 233 +--------------------- pandas/_libs/tslibs/conversion.pxd | 15 +- pandas/_libs/tslibs/conversion.pyx | 310 ++++++++++++++++++++++++++++- 4 files changed, 324 insertions(+), 237 deletions(-) diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd index 5ceff32cfbac7..443b3867eb2b5 100644 --- a/pandas/_libs/tslib.pxd +++ b/pandas/_libs/tslib.pxd @@ -1,6 +1,7 @@ from numpy cimport ndarray, int64_t -cdef convert_to_tsobject(object, object, object, bint, bint) +from tslibs.conversion cimport convert_to_tsobject + cpdef convert_to_timedelta64(object, object) cdef bint _check_all_nulls(obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8a882a465f9f7..d87a6b958a07b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -96,7 +96,10 @@ from tslibs.timezones cimport ( from tslibs.fields import ( get_date_name_field, get_start_end_field, get_date_field, build_field_sarray) -from tslibs.conversion cimport tz_convert_single, _TSObject, _localize_tso +from tslibs.conversion cimport (tz_convert_single, _TSObject, + convert_to_tsobject, + convert_datetime_to_tsobject, + get_datetime64_nanos) from tslibs.conversion import ( tz_localize_to_utc, tz_convert, tz_convert_single) @@ -1212,215 +1215,6 @@ cdef inline bint is_timestamp(object o): return Py_TYPE(o) == ts_type # isinstance(o, Timestamp) -# helper to extract datetime and int64 from several different possibilities -cdef convert_to_tsobject(object ts, object tz, object unit, - bint dayfirst, bint yearfirst): - """ - Extract datetime and int64 from any of: - - np.int64 (with unit providing a possible modifier) - - np.datetime64 - - a float (with unit providing a possible modifier) - - python int or long object (with unit providing a possible modifier) - - iso8601 string object - - python datetime object - - another timestamp object - """ - cdef: - _TSObject obj - - if tz is not None: - tz = maybe_get_tz(tz) - - obj = _TSObject() - - if is_string_object(ts): - return 
convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) - - if ts is None or ts is NaT: - obj.value = NPY_NAT - elif is_datetime64_object(ts): - if ts.view('i8') == NPY_NAT: - obj.value = NPY_NAT - else: - obj.value = _get_datetime64_nanos(ts) - dt64_to_dtstruct(obj.value, &obj.dts) - elif is_integer_object(ts): - if ts == NPY_NAT: - obj.value = NPY_NAT - else: - ts = ts * cast_from_unit(None, unit) - obj.value = ts - dt64_to_dtstruct(ts, &obj.dts) - elif is_float_object(ts): - if ts != ts or ts == NPY_NAT: - obj.value = NPY_NAT - else: - ts = cast_from_unit(ts, unit) - obj.value = ts - dt64_to_dtstruct(ts, &obj.dts) - elif PyDateTime_Check(ts): - return convert_datetime_to_tsobject(ts, tz) - elif PyDate_Check(ts): - # Keep the converter same as PyDateTime's - ts = datetime.combine(ts, datetime_time()) - return convert_datetime_to_tsobject(ts, tz) - elif getattr(ts, '_typ', None) == 'period': - raise ValueError("Cannot convert Period to Timestamp " - "unambiguously. Use to_timestamp") - else: - raise TypeError('Cannot convert input [{}] of type {} to ' - 'Timestamp'.format(ts, type(ts))) - - if obj.value != NPY_NAT: - check_dts_bounds(&obj.dts) - - if tz is not None: - _localize_tso(obj, tz) - - return obj - - -cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, - int32_t nanos=0): - """ - Convert a datetime (or Timestamp) input `ts`, along with optional timezone - object `tz` to a _TSObject. - - The optional argument `nanos` allows for cases where datetime input - needs to be supplemented with higher-precision information. - - Parameters - ---------- - ts : datetime or Timestamp - Value to be converted to _TSObject - tz : tzinfo or None - timezone for the timezone-aware output - nanos : int32_t, default is 0 - nanoseconds supplement the precision of the datetime input ts - - Returns - ------- - obj : _TSObject - """ - cdef: - _TSObject obj = _TSObject() - - if tz is not None: - tz = maybe_get_tz(tz) - - # sort of a temporary hack - if ts.tzinfo is not None: - if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'): - ts = tz.normalize(ts) - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: - # tzoffset - try: - tz = ts.astimezone(tz).tzinfo - except: - pass - obj.value = pydatetime_to_dt64(ts, &obj.dts) - ts_offset = get_utcoffset(ts.tzinfo, ts) - obj.value -= int(ts_offset.total_seconds() * 1e9) - tz_offset = get_utcoffset(tz, ts) - obj.value += int(tz_offset.total_seconds() * 1e9) - dt64_to_dtstruct(obj.value, &obj.dts) - obj.tzinfo = tz - elif not is_utc(tz): - ts = _localize_pydatetime(ts, tz) - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: - # UTC - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = pytz.utc - else: - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - - if obj.tzinfo is not None and not is_utc(obj.tzinfo): - offset = get_utcoffset(obj.tzinfo, ts) - obj.value -= int(offset.total_seconds() * 1e9) - - if is_timestamp(ts): - obj.value += ts.nanosecond - obj.dts.ps = ts.nanosecond * 1000 - - if nanos: - obj.value += nanos - obj.dts.ps = nanos * 1000 - - check_dts_bounds(&obj.dts) - return obj - - -cdef convert_str_to_tsobject(object ts, object tz, object unit, - bint dayfirst=False, bint yearfirst=False): - """ ts must be a string """ - - cdef: - _TSObject obj - int out_local = 0, out_tzoffset = 0 - datetime dt - - if tz is not None: - tz = maybe_get_tz(tz) - - obj = _TSObject() - - assert is_string_object(ts) - - if len(ts) == 0 or ts in nat_strings: - ts 
= NaT - elif ts == 'now': - # Issue 9000, we short-circuit rather than going - # into np_datetime_strings which returns utc - ts = datetime.now(tz) - elif ts == 'today': - # Issue 9000, we short-circuit rather than going - # into np_datetime_strings which returns a normalized datetime - ts = datetime.now(tz) - # equiv: datetime.today().replace(tzinfo=tz) - else: - try: - _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) - obj.value = dtstruct_to_dt64(&obj.dts) - check_dts_bounds(&obj.dts) - if out_local == 1: - obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') - if tz is None: - check_dts_bounds(&obj.dts) - return obj - else: - # Keep the converter same as PyDateTime's - obj = convert_to_tsobject(obj.value, obj.tzinfo, - None, 0, 0) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, - obj.dts.hour, obj.dts.min, obj.dts.sec, - obj.dts.us, obj.tzinfo) - obj = convert_datetime_to_tsobject(dt, tz, - nanos=obj.dts.ps / 1000) - return obj - - else: - ts = obj.value - if tz is not None: - # shift for _localize_tso - ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, - ambiguous='raise', - errors='raise')[0] - except ValueError: - try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - raise ValueError("could not convert string to Timestamp") - - return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) - - def _test_parse_iso8601(object ts): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used @@ -1841,7 +1635,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] = NPY_NAT else: try: - iresult[i] = _get_datetime64_nanos(val) + iresult[i] = get_datetime64_nanos(val) seen_datetime = 1 except ValueError: if is_coerce: @@ -2779,23 +2573,6 @@ cpdef int64_t _delta_to_nanoseconds(delta) except? -1: delta.microseconds) * 1000 -cdef inline _get_datetime64_nanos(object val): - cdef: - pandas_datetimestruct dts - PANDAS_DATETIMEUNIT unit - npy_datetime ival - - unit = get_datetime64_unit(val) - ival = get_datetime64_value(val) - - if unit != PANDAS_FR_ns: - pandas_datetime_to_datetimestruct(ival, unit, &dts) - check_dts_bounds(&dts) - return dtstruct_to_dt64(&dts) - else: - return ival - - def cast_to_nanoseconds(ndarray arr): cdef: Py_ssize_t i, n = arr.size diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index a042ee8949192..843a688a2630c 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- # cython: profile=False -from numpy cimport int64_t +from cpython.datetime cimport datetime -from datetime cimport pandas_datetimestruct +from numpy cimport int64_t, int32_t + +from np_datetime cimport pandas_datetimestruct cdef class _TSObject: @@ -12,6 +14,15 @@ cdef class _TSObject: int64_t value # numpy dt64 object tzinfo + +cdef convert_to_tsobject(object ts, object tz, object unit, + bint dayfirst, bint yearfirst) + +cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, + int32_t nanos=*) + cdef void _localize_tso(_TSObject obj, object tz) cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2) + +cdef int64_t get_datetime64_nanos(object val) except? 
-1 diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 478d3bba80b00..61efc865112a9 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -6,23 +6,41 @@ from cython cimport Py_ssize_t import numpy as np cimport numpy as np -from numpy cimport int64_t, ndarray +from numpy cimport int64_t, int32_t, ndarray np.import_array() import pytz -from cpython.datetime cimport datetime +# stdlib datetime imports +from datetime import time as datetime_time +from cpython.datetime cimport (datetime, tzinfo, + PyDateTime_Check, PyDate_Check, + PyDateTime_CheckExact, PyDateTime_IMPORT) +PyDateTime_IMPORT from np_datetime cimport (check_dts_bounds, pandas_datetimestruct, - dt64_to_dtstruct, dtstruct_to_dt64) + dt64_to_dtstruct, dtstruct_to_dt64, + pydatetime_to_dt64) + +from datetime cimport (pandas_datetime_to_datetimestruct, + PANDAS_DATETIMEUNIT, PANDAS_FR_ns, npy_datetime, + _string_to_dts, + get_datetime64_unit, get_datetime64_value) cimport util +from util cimport (is_string_object, + is_datetime64_object, + is_integer_object, is_float_object) +from timedeltas cimport cast_from_unit from timezones cimport ( is_utc, is_tzlocal, is_fixed_offset, treat_tz_as_dateutil, treat_tz_as_pytz, - get_utcoffset, get_dst_info, get_timezone) + get_utcoffset, get_dst_info, get_timezone, maybe_get_tz) +from parsing import parse_datetime_string + +from nattype import nat_strings, NaT # ---------------------------------------------------------------------- # Constants @@ -32,6 +50,30 @@ cdef int64_t DAY_NS = 86400000000000LL UTC = pytz.UTC +# ---------------------------------------------------------------------- +# Misc Helpers + + +# TODO: How to declare np.datetime64 as the input type? +cdef inline int64_t get_datetime64_nanos(object val) except? -1: + """ + Extract the value and unit from a np.datetime64 object, then convert the + value to nanoseconds if necessary. 
+ """ + cdef: + pandas_datetimestruct dts + PANDAS_DATETIMEUNIT unit + npy_datetime ival + + unit = get_datetime64_unit(val) + ival = get_datetime64_value(val) + + if unit != PANDAS_FR_ns: + pandas_datetime_to_datetimestruct(ival, unit, &dts) + check_dts_bounds(&dts) + ival = dtstruct_to_dt64(&dts) + + return ival # ---------------------------------------------------------------------- # _TSObject Conversion @@ -48,6 +90,241 @@ cdef class _TSObject: return self.value +cdef convert_to_tsobject(object ts, object tz, object unit, + bint dayfirst, bint yearfirst): + """ + Extract datetime and int64 from any of: + - np.int64 (with unit providing a possible modifier) + - np.datetime64 + - a float (with unit providing a possible modifier) + - python int or long object (with unit providing a possible modifier) + - iso8601 string object + - python datetime object + - another timestamp object + """ + cdef: + _TSObject obj + + if tz is not None: + tz = maybe_get_tz(tz) + + obj = _TSObject() + + if is_string_object(ts): + return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + + if ts is None or ts is NaT: + obj.value = NPY_NAT + elif is_datetime64_object(ts): + if ts.view('i8') == NPY_NAT: + obj.value = NPY_NAT + else: + obj.value = get_datetime64_nanos(ts) + dt64_to_dtstruct(obj.value, &obj.dts) + elif is_integer_object(ts): + if ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = ts * cast_from_unit(None, unit) + obj.value = ts + dt64_to_dtstruct(ts, &obj.dts) + elif is_float_object(ts): + if ts != ts or ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = cast_from_unit(ts, unit) + obj.value = ts + dt64_to_dtstruct(ts, &obj.dts) + elif PyDateTime_Check(ts): + return convert_datetime_to_tsobject(ts, tz) + elif PyDate_Check(ts): + # Keep the converter same as PyDateTime's + ts = datetime.combine(ts, datetime_time()) + return convert_datetime_to_tsobject(ts, tz) + elif getattr(ts, '_typ', None) == 'period': + raise ValueError("Cannot convert Period to Timestamp " + "unambiguously. Use to_timestamp") + else: + raise TypeError('Cannot convert input [{}] of type {} to ' + 'Timestamp'.format(ts, type(ts))) + + if obj.value != NPY_NAT: + check_dts_bounds(&obj.dts) + + if tz is not None: + _localize_tso(obj, tz) + + return obj + + +cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, + int32_t nanos=0): + """ + Convert a datetime (or Timestamp) input `ts`, along with optional timezone + object `tz` to a _TSObject. + + The optional argument `nanos` allows for cases where datetime input + needs to be supplemented with higher-precision information. 
+ + Parameters + ---------- + ts : datetime or Timestamp + Value to be converted to _TSObject + tz : tzinfo or None + timezone for the timezone-aware output + nanos : int32_t, default is 0 + nanoseconds supplement the precision of the datetime input ts + + Returns + ------- + obj : _TSObject + """ + cdef: + _TSObject obj = _TSObject() + + if tz is not None: + tz = maybe_get_tz(tz) + + # sort of a temporary hack + if ts.tzinfo is not None: + if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'): + ts = tz.normalize(ts) + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + else: + # tzoffset + try: + tz = ts.astimezone(tz).tzinfo + except: + pass + obj.value = pydatetime_to_dt64(ts, &obj.dts) + ts_offset = get_utcoffset(ts.tzinfo, ts) + obj.value -= int(ts_offset.total_seconds() * 1e9) + tz_offset = get_utcoffset(tz, ts) + obj.value += int(tz_offset.total_seconds() * 1e9) + dt64_to_dtstruct(obj.value, &obj.dts) + obj.tzinfo = tz + elif not is_utc(tz): + ts = _localize_pydatetime(ts, tz) + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + else: + # UTC + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = pytz.utc + else: + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + + if obj.tzinfo is not None and not is_utc(obj.tzinfo): + offset = get_utcoffset(obj.tzinfo, ts) + obj.value -= int(offset.total_seconds() * 1e9) + + if not PyDateTime_CheckExact(ts): + # datetime instance but not datetime type --> Timestamp + obj.value += ts.nanosecond + obj.dts.ps = ts.nanosecond * 1000 + + if nanos: + obj.value += nanos + obj.dts.ps = nanos * 1000 + + check_dts_bounds(&obj.dts) + return obj + + +cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, + bint dayfirst=False, + bint yearfirst=False): + """ + Convert a string-like (bytes or unicode) input `ts`, along with optional + timezone object `tz` to a _TSObject. + + The optional arguments `dayfirst` and `yearfirst` are passed to the + dateutil parser. + + Parameters + ---------- + ts : bytes or unicode + Value to be converted to _TSObject + tz : tzinfo or None + timezone for the timezone-aware output + dayfirst : bool, default False + When parsing an ambiguous date string, interpret e.g. "3/4/1975" as + April 3, as opposed to the standard US interpretation March 4. + yearfirst : bool, default False + When parsing an ambiguous date string, interpret e.g. 
"01/05/09" + as "May 9, 2001", as opposed to the default "Jan 5, 2009" + + Returns + ------- + obj : _TSObject + """ + cdef: + _TSObject obj + int out_local = 0, out_tzoffset = 0 + datetime dt + + if tz is not None: + tz = maybe_get_tz(tz) + + obj = _TSObject() + + assert is_string_object(ts) + + if len(ts) == 0 or ts in nat_strings: + ts = NaT + elif ts == 'now': + # Issue 9000, we short-circuit rather than going + # into np_datetime_strings which returns utc + ts = datetime.now(tz) + elif ts == 'today': + # Issue 9000, we short-circuit rather than going + # into np_datetime_strings which returns a normalized datetime + ts = datetime.now(tz) + # equiv: datetime.today().replace(tzinfo=tz) + else: + try: + _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) + obj.value = dtstruct_to_dt64(&obj.dts) + check_dts_bounds(&obj.dts) + if out_local == 1: + obj.tzinfo = pytz.FixedOffset(out_tzoffset) + obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') + if tz is None: + check_dts_bounds(&obj.dts) + return obj + else: + # Keep the converter same as PyDateTime's + obj = convert_to_tsobject(obj.value, obj.tzinfo, + None, 0, 0) + dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) + obj = convert_datetime_to_tsobject(dt, tz, + nanos=obj.dts.ps / 1000) + return obj + + else: + ts = obj.value + if tz is not None: + # shift for _localize_tso + ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, + ambiguous='raise', + errors='raise')[0] + except ValueError: + try: + ts = parse_datetime_string(ts, dayfirst=dayfirst, + yearfirst=yearfirst) + except Exception: + raise ValueError("could not convert string to Timestamp") + + return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + + +# ---------------------------------------------------------------------- +# Localization + cdef inline void _localize_tso(_TSObject obj, object tz): """ Take a TSObject in UTC and localizes to timezone tz. @@ -55,6 +332,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): cdef: ndarray[int64_t] trans, deltas Py_ssize_t delta, posn + datetime dt if is_utc(tz): obj.tzinfo = tz @@ -99,8 +377,24 @@ cdef inline void _localize_tso(_TSObject obj, object tz): obj.tzinfo = tz +cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz): + """ + Take a datetime/Timestamp in UTC and localizes to timezone tz. + + NB: Unlike the version in tslib, this treats datetime and Timestamp objects + identically, i.e. discards nanos from Timestamps. + It also assumes that the `tz` input is not None. 
+ """ + if tz == 'UTC' or tz is UTC: + return UTC.localize(dt) + try: + # datetime.replace with pytz may be incorrect result + return tz.localize(dt) + except AttributeError: + return dt.replace(tzinfo=tz) + # ---------------------------------------------------------------------- -# Localization / Timezone Conversion +# Timezone Conversion cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): @@ -126,6 +420,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): Py_ssize_t pos int64_t v, offset, utc_date pandas_datetimestruct dts + datetime dt if val == NPY_NAT: return val @@ -190,6 +485,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): ndarray[Py_ssize_t] posn int64_t v, offset, delta pandas_datetimestruct dts + datetime dt if len(vals) == 0: return np.array([], dtype=np.int64) @@ -281,6 +577,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return result +# TODO: cdef scalar version to call from convert_str_to_tsobject @cython.boundscheck(False) @cython.wraparound(False) def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, @@ -303,6 +600,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, pandas_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False bint is_coerce = errors == 'coerce', is_raise = errors == 'raise' + datetime dt # Vectorized version of DstTzInfo.localize @@ -323,7 +621,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result[i] = v - delta return result - if util.is_string_object(ambiguous): + if is_string_object(ambiguous): if ambiguous == 'infer': infer_dst = True elif ambiguous == 'NaT': From aa5ea0ffbf6b48039a542823ad8759b5901d19d8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Nov 2017 17:57:39 -0700 Subject: [PATCH 21/44] Port Timedelta implementation to tslibs.timedeltas (#17937) --- pandas/_libs/tslib.pyx | 412 ++++------------------------- pandas/_libs/tslibs/timedeltas.pxd | 13 + pandas/_libs/tslibs/timedeltas.pyx | 325 +++++++++++++++++++++++ 3 files changed, 394 insertions(+), 356 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d87a6b958a07b..5a4af4550f589 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -74,7 +74,6 @@ cimport cython from pandas.compat import iteritems -import collections import warnings import pytz @@ -1791,17 +1790,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult +from tslibs.timedeltas cimport _Timedelta as __Timedelta + # Similar to Timestamp/datetime, this is a construction requirement for # timedeltas that we need to do object instantiation in python. This will # serve as a C extension type that shadows the Python class, where we do any # heavy lifting. 
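The localization helpers consolidated above (``_localize_pydatetime``, ``tz_convert_single``, ``tz_localize_to_utc``) back the user-facing timezone handling. A minimal sketch of that behaviour, assuming current pandas semantics rather than anything added by this patch::

    import pandas as pd

    naive = pd.Timestamp('2017-11-02 12:00')
    # tz_localize attaches a zone and keeps the wall time
    localized = naive.tz_localize('US/Eastern')   # 2017-11-02 12:00:00-04:00
    # tz_convert keeps the instant and changes the zone
    converted = localized.tz_convert('UTC')       # 2017-11-02 16:00:00+00:00

The split mirrors the two code paths above: localization has to consult the DST transition table (hence the ``ambiguous`` handling in ``tz_localize_to_utc``), while conversion is a pure offset shift.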
-cdef class _Timedelta(timedelta): - - cdef readonly: - int64_t value # nanoseconds - object freq # frequency reference - bint is_populated # are my components populated - int64_t _sign, _d, _h, _m, _s, _ms, _us, _ns +cdef class _Timedelta(__Timedelta): def __hash__(_Timedelta self): if self._has_ns(): @@ -1848,86 +1843,65 @@ cdef class _Timedelta(timedelta): return cmp_scalar(self.value, ots.value, op) - def _ensure_components(_Timedelta self): - """ - compute the components - """ - cdef int64_t sfrac, ifrac, frac, ivalue = self.value - if self.is_populated: - return +def _binary_op_method_timedeltalike(op, name): + # define a binary operation that only works if the other argument is + # timedelta like or an array of timedeltalike + def f(self, other): + # an offset + if hasattr(other, 'delta') and not isinstance(other, Timedelta): + return op(self, other.delta) - # put frac in seconds - frac = ivalue /(1000 *1000 *1000) - if frac < 0: - self._sign = -1 + # a datetimelike + if (isinstance(other, (datetime, np.datetime64)) + and not (isinstance(other, Timestamp) or other is NaT)): + return op(self, Timestamp(other)) - # even fraction - if (-frac % 86400) != 0: - self._d = -frac /86400 + 1 - frac += 86400 *self._d - else: - frac = -frac - else: - self._sign = 1 - self._d = 0 + # nd-array like + if hasattr(other, 'dtype'): + if other.dtype.kind not in ['m', 'M']: + # raise rathering than letting numpy return wrong answer + return NotImplemented + return op(self.to_timedelta64(), other) - if frac >= 86400: - self._d += frac / 86400 - frac -= self._d * 86400 + if not _validate_ops_compat(other): + return NotImplemented - if frac >= 3600: - self._h = frac / 3600 - frac -= self._h * 3600 - else: - self._h = 0 + if other is NaT: + return NaT - if frac >= 60: - self._m = frac / 60 - frac -= self._m * 60 - else: - self._m = 0 + try: + other = Timedelta(other) + except ValueError: + # failed to parse as timedelta + return NotImplemented - if frac >= 0: - self._s = frac - frac -= self._s - else: - self._s = 0 + return Timedelta(op(self.value, other.value), unit='ns') - sfrac = (self._h * 3600 + self._m * 60 - + self._s) * (1000 * 1000 * 1000) - if self._sign < 0: - ifrac = ivalue + self._d *DAY_NS - sfrac - else: - ifrac = ivalue - (self._d *DAY_NS + sfrac) - - if ifrac != 0: - self._ms = ifrac /(1000 *1000) - ifrac -= self._ms *1000 *1000 - self._us = ifrac /1000 - ifrac -= self._us *1000 - self._ns = ifrac - else: - self._ms = 0 - self._us = 0 - self._ns = 0 + f.__name__ = name + return f - self.is_populated = 1 - cpdef timedelta to_pytimedelta(_Timedelta self): - """ - return an actual datetime.timedelta object - note: we lose nanosecond resolution if any - """ - return timedelta(microseconds=int(self.value) /1000) +def _op_unary_method(func, name): + + def f(self): + return Timedelta(func(self.value), unit='ns') + f.__name__ = name + return f + - cpdef bint _has_ns(self): - return self.value % 1000 != 0 +cdef bint _validate_ops_compat(other): + # return True if we are compat with operating + if _checknull_with_nat(other): + return True + elif PyDelta_Check(other) or is_timedelta64_object(other): + return True + elif util.is_string_object(other): + return True + elif hasattr(other, 'delta'): + return True + return False -# components named tuple -Components = collections.namedtuple('Components', [ - 'days', 'hours', 'minutes', 'seconds', - 'milliseconds', 'microseconds', 'nanoseconds']) # Python front end to C extension type _Timedelta # This serves as the box for timedelta64 @@ -2015,48 +1989,18 
@@ class Timedelta(_Timedelta): return NaT # make timedelta happy - td_base = _Timedelta.__new__(cls, microseconds=int(value) /1000) + td_base = _Timedelta.__new__(cls, microseconds=int(value) / 1000) td_base.value = value td_base.is_populated = 0 return td_base - @property - def delta(self): - """ return out delta in ns (for internal compat) """ - return self.value - - @property - def asm8(self): - """ return a numpy timedelta64 array view of myself """ - return np.int64(self.value).view('m8[ns]') - - @property - def resolution(self): - """ return a string representing the lowest resolution that we have """ - - self._ensure_components() - if self._ns: - return "N" - elif self._us: - return "U" - elif self._ms: - return "L" - elif self._s: - return "S" - elif self._m: - return "T" - elif self._h: - return "H" - else: - return "D" - def _round(self, freq, rounder): cdef int64_t result, unit from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - result = unit *rounder(self.value /float(unit)) + result = unit * rounder(self.value / float(unit)) return Timedelta(result, unit='ns') def round(self, freq): @@ -2097,182 +2041,6 @@ class Timedelta(_Timedelta): """ return self._round(freq, np.ceil) - def _repr_base(self, format=None): - """ - - Parameters - ---------- - format : None|all|even_day|sub_day|long - - Returns - ------- - converted : string of a Timedelta - - """ - cdef object sign_pretty, sign2_pretty, seconds_pretty, subs - - self._ensure_components() - - if self._sign < 0: - sign_pretty = "-" - sign2_pretty = " +" - else: - sign_pretty = "" - sign2_pretty = " " - - # show everything - if format == 'all': - seconds_pretty = "%02d.%03d%03d%03d" % ( - self._s, self._ms, self._us, self._ns) - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, - sign2_pretty, self._h, - self._m, seconds_pretty) - - # by default not showing nano - if self._ms or self._us or self._ns: - seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) - else: - seconds_pretty = "%02d" % self._s - - # if we have a partial day - subs = (self._h or self._m or self._s or - self._ms or self._us or self._ns) - - if format == 'even_day': - if not subs: - return "%s%d days" % (sign_pretty, self._d) - - elif format == 'sub_day': - if not self._d: - - # degenerate, don't need the extra space - if self._sign > 0: - sign2_pretty = "" - return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, - self._h, self._m, seconds_pretty) - - if subs or format=='long': - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, - sign2_pretty, self._h, - self._m, seconds_pretty) - return "%s%d days" % (sign_pretty, self._d) - - def __repr__(self): - return "Timedelta('{0}')".format(self._repr_base(format='long')) - - def __str__(self): - return self._repr_base(format='long') - - @property - def components(self): - """ Return a Components NamedTuple-like """ - self._ensure_components() - if self._sign < 0: - return Components(-self._d, self._h, self._m, self._s, - self._ms, self._us, self._ns) - - # return the named tuple - return Components(self._d, self._h, self._m, self._s, - self._ms, self._us, self._ns) - - @property - def days(self): - """ - Number of Days - - .components will return the shown components - """ - self._ensure_components() - if self._sign < 0: - return -1 *self._d - return self._d - - @property - def seconds(self): - """ - Number of seconds (>= 0 and less than 1 day). 
- - .components will return the shown components - """ - self._ensure_components() - return self._h *3600 + self._m *60 + self._s - - @property - def microseconds(self): - """ - Number of microseconds (>= 0 and less than 1 second). - - .components will return the shown components - """ - self._ensure_components() - return self._ms *1000 + self._us - - @property - def nanoseconds(self): - """ - Number of nanoseconds (>= 0 and less than 1 microsecond). - - .components will return the shown components - """ - self._ensure_components() - return self._ns - - def total_seconds(self): - """ - Total duration of timedelta in seconds (to ns precision) - """ - return 1e-9 *self.value - - def isoformat(self): - """ - Format Timedelta as ISO 8601 Duration like - `P[n]Y[n]M[n]DT[n]H[n]M[n]S`, where the `[n]`s are replaced by the - values. See https://en.wikipedia.org/wiki/ISO_8601#Durations - - .. versionadded:: 0.20.0 - - Returns - ------- - formatted : str - - Notes - ----- - The longest component is days, whose value may be larger than - 365. - Every component is always included, even if its value is 0. - Pandas uses nanosecond precision, so up to 9 decimal places may - be included in the seconds component. - Trailing 0's are removed from the seconds component after the decimal. - We do not 0 pad components, so it's `...T5H...`, not `...T05H...` - - Examples - -------- - >>> td = pd.Timedelta(days=6, minutes=50, seconds=3, - ... milliseconds=10, microseconds=10, nanoseconds=12) - >>> td.isoformat() - 'P6DT0H50M3.010010012S' - >>> pd.Timedelta(hours=1, seconds=10).isoformat() - 'P0DT0H0M10S' - >>> pd.Timedelta(hours=1, seconds=10).isoformat() - 'P0DT0H0M10S' - >>> pd.Timedelta(days=500.5).isoformat() - 'P500DT12H0MS' - - See Also - -------- - Timestamp.isoformat - """ - components = self.components - seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, - components.milliseconds, - components.microseconds, - components.nanoseconds) - # Trim unnecessary 0s, 1.000000000 -> 1 - seconds = seconds.rstrip('0').rstrip('.') - tpl = 'P{td.days}DT{td.hours}H{td.minutes}M{seconds}S'.format( - td=components, seconds=seconds) - return tpl - def __setstate__(self, state): (value) = state self.value = value @@ -2281,67 +2049,6 @@ class Timedelta(_Timedelta): object_state = self.value, return (Timedelta, object_state) - def view(self, dtype): - """ array view compat """ - return np.timedelta64(self.value).view(dtype) - - def to_timedelta64(self): - """ Returns a numpy.timedelta64 object with 'ns' precision """ - return np.timedelta64(self.value, 'ns') - - def _validate_ops_compat(self, other): - - # return True if we are compat with operating - if _checknull_with_nat(other): - return True - elif PyDelta_Check(other) or is_timedelta64_object(other): - return True - elif is_string_object(other): - return True - elif hasattr(other, 'delta'): - return True - return False - - # higher than np.ndarray and np.matrix - __array_priority__ = 100 - - def _binary_op_method_timedeltalike(op, name): - # define a binary operation that only works if the other argument is - # timedelta like or an array of timedeltalike - def f(self, other): - # an offset - if hasattr(other, 'delta') and not isinstance(other, Timedelta): - return op(self, other.delta) - - # a datetimelike - if (isinstance(other, (datetime, np.datetime64)) - and not (isinstance(other, Timestamp) or other is NaT)): - return op(self, Timestamp(other)) - - # nd-array like - if hasattr(other, 'dtype'): - if other.dtype.kind not in ['m', 'M']: - # raise rathering 
than letting numpy return wrong answer - return NotImplemented - return op(self.to_timedelta64(), other) - - if not self._validate_ops_compat(other): - return NotImplemented - - if other is NaT: - return NaT - - try: - other = Timedelta(other) - except ValueError: - # failed to parse as timedelta - return NotImplemented - - return Timedelta(op(self.value, other.value), unit='ns') - - f.__name__ = name - return f - __add__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__add__') __radd__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__radd__') __sub__ = _binary_op_method_timedeltalike(lambda x, y: x - y, '__sub__') @@ -2373,7 +2080,7 @@ class Timedelta(_Timedelta): if is_integer_object(other) or is_float_object(other): return Timedelta(self.value /other, unit='ns') - if not self._validate_ops_compat(other): + if not _validate_ops_compat(other): return NotImplemented other = Timedelta(other) @@ -2385,7 +2092,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype'): return other / self.to_timedelta64() - if not self._validate_ops_compat(other): + if not _validate_ops_compat(other): return NotImplemented other = Timedelta(other) @@ -2410,7 +2117,7 @@ class Timedelta(_Timedelta): if is_integer_object(other): return Timedelta(self.value // other, unit='ns') - if not self._validate_ops_compat(other): + if not _validate_ops_compat(other): return NotImplemented other = Timedelta(other) @@ -2425,7 +2132,7 @@ class Timedelta(_Timedelta): other = other.astype('m8[ns]').astype('i8') return other // self.value - if not self._validate_ops_compat(other): + if not _validate_ops_compat(other): return NotImplemented other = Timedelta(other) @@ -2433,13 +2140,6 @@ class Timedelta(_Timedelta): return NaT return other.value // self.value - def _op_unary_method(func, name): - - def f(self): - return Timedelta(func(self.value), unit='ns') - f.__name__ = name - return f - __inv__ = _op_unary_method(lambda x: -x, '__inv__') __neg__ = _op_unary_method(lambda x: -x, '__neg__') __pos__ = _op_unary_method(lambda x: x, '__pos__') diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 7f1d6bc926894..4dfd3f3e9eca5 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -1,8 +1,21 @@ # -*- coding: utf-8 -*- # cython: profile=False +from cpython.datetime cimport timedelta + from numpy cimport int64_t # Exposed for tslib, not intended for outside use. cdef parse_timedelta_string(object ts) cpdef int64_t cast_from_unit(object ts, object unit) except? 
-1 + + +cdef class _Timedelta(timedelta): + cdef readonly: + int64_t value # nanoseconds + object freq # frequency reference + bint is_populated # are my components populated + int64_t _sign, _d, _h, _m, _s, _ms, _us, _ns + + cpdef timedelta to_pytimedelta(_Timedelta self) + cpdef bint _has_ns(self) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 10c379ad43a63..2f177868a6947 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,13 +1,24 @@ # -*- coding: utf-8 -*- # cython: profile=False +import collections + import sys cdef bint PY3 = (sys.version_info[0] >= 3) from cpython cimport PyUnicode_Check +import numpy as np +cimport numpy as np from numpy cimport int64_t +np.import_array() + +from cpython.datetime cimport (datetime, timedelta, + PyDelta_Check, PyDateTime_IMPORT) +PyDateTime_IMPORT + cimport util +from util cimport is_timedelta64_object from nattype import nat_strings @@ -16,6 +27,13 @@ from nattype import nat_strings cdef int64_t NPY_NAT = util.get_nat() +cdef int64_t DAY_NS = 86400000000000LL + +# components named tuple +Components = collections.namedtuple('Components', [ + 'days', 'hours', 'minutes', 'seconds', + 'milliseconds', 'microseconds', 'nanoseconds']) + cdef dict timedelta_abbrevs = { 'D': 'd', 'd': 'd', 'days': 'd', @@ -296,3 +314,310 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) + +# ---------------------------------------------------------------------- +# Timedelta Construction + +# Similar to Timestamp/datetime, this is a construction requirement for +# timedeltas that we need to do object instantiation in python. This will +# serve as a C extension type that shadows the Python class, where we do any +# heavy lifting. 
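# Illustrative sketch, not a line of this patch: once the public Timedelta
# class builds on this C base type, the nanosecond payload and the derived
# components surface roughly as follows (assuming a recent pandas):
#   >>> import pandas as pd
#   >>> pd.Timedelta(days=1, microseconds=3).value
#   86400000003000
#   >>> pd.Timedelta(days=1, microseconds=3).components.microseconds
#   3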
+cdef class _Timedelta(timedelta): + # cdef readonly: + # int64_t value # nanoseconds + # object freq # frequency reference + # bint is_populated # are my components populated + # int64_t _sign, _d, _h, _m, _s, _ms, _us, _ns + + # higher than np.ndarray and np.matrix + __array_priority__ = 100 + + cpdef bint _has_ns(self): + return self.value % 1000 != 0 + + def _ensure_components(_Timedelta self): + """ + compute the components + """ + cdef int64_t sfrac, ifrac, frac, ivalue = self.value + + if self.is_populated: + return + + # put frac in seconds + frac = ivalue / (1000 * 1000 * 1000) + if frac < 0: + self._sign = -1 + + # even fraction + if (-frac % 86400) != 0: + self._d = -frac / 86400 + 1 + frac += 86400 * self._d + else: + frac = -frac + else: + self._sign = 1 + self._d = 0 + + if frac >= 86400: + self._d += frac / 86400 + frac -= self._d * 86400 + + if frac >= 3600: + self._h = frac / 3600 + frac -= self._h * 3600 + else: + self._h = 0 + + if frac >= 60: + self._m = frac / 60 + frac -= self._m * 60 + else: + self._m = 0 + + if frac >= 0: + self._s = frac + frac -= self._s + else: + self._s = 0 + + sfrac = (self._h * 3600 + self._m * 60 + + self._s) * (1000 * 1000 * 1000) + if self._sign < 0: + ifrac = ivalue + self._d * DAY_NS - sfrac + else: + ifrac = ivalue - (self._d * DAY_NS + sfrac) + + if ifrac != 0: + self._ms = ifrac / (1000 * 1000) + ifrac -= self._ms * 1000 * 1000 + self._us = ifrac / 1000 + ifrac -= self._us * 1000 + self._ns = ifrac + else: + self._ms = 0 + self._us = 0 + self._ns = 0 + + self.is_populated = 1 + + cpdef timedelta to_pytimedelta(_Timedelta self): + """ + return an actual datetime.timedelta object + note: we lose nanosecond resolution if any + """ + return timedelta(microseconds=int(self.value) / 1000) + + def to_timedelta64(self): + """ Returns a numpy.timedelta64 object with 'ns' precision """ + return np.timedelta64(self.value, 'ns') + + def total_seconds(self): + """ + Total duration of timedelta in seconds (to ns precision) + """ + return 1e-9 * self.value + + def view(self, dtype): + """ array view compat """ + return np.timedelta64(self.value).view(dtype) + + @property + def components(self): + """ Return a Components NamedTuple-like """ + self._ensure_components() + if self._sign < 0: + return Components(-self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) + + # return the named tuple + return Components(self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) + + @property + def delta(self): + """ return out delta in ns (for internal compat) """ + return self.value + + @property + def asm8(self): + """ return a numpy timedelta64 array view of myself """ + return np.int64(self.value).view('m8[ns]') + + @property + def resolution(self): + """ return a string representing the lowest resolution that we have """ + + self._ensure_components() + if self._ns: + return "N" + elif self._us: + return "U" + elif self._ms: + return "L" + elif self._s: + return "S" + elif self._m: + return "T" + elif self._h: + return "H" + else: + return "D" + + @property + def days(self): + """ + Number of Days + + .components will return the shown components + """ + self._ensure_components() + if self._sign < 0: + return -1 * self._d + return self._d + + @property + def seconds(self): + """ + Number of seconds (>= 0 and less than 1 day). 
+ + .components will return the shown components + """ + self._ensure_components() + return self._h * 3600 + self._m * 60 + self._s + + @property + def microseconds(self): + """ + Number of microseconds (>= 0 and less than 1 second). + + .components will return the shown components + """ + self._ensure_components() + return self._ms * 1000 + self._us + + @property + def nanoseconds(self): + """ + Number of nanoseconds (>= 0 and less than 1 microsecond). + + .components will return the shown components + """ + self._ensure_components() + return self._ns + + def _repr_base(self, format=None): + """ + + Parameters + ---------- + format : None|all|even_day|sub_day|long + + Returns + ------- + converted : string of a Timedelta + + """ + cdef object sign_pretty, sign2_pretty, seconds_pretty, subs + + self._ensure_components() + + if self._sign < 0: + sign_pretty = "-" + sign2_pretty = " +" + else: + sign_pretty = "" + sign2_pretty = " " + + # show everything + if format == 'all': + seconds_pretty = "%02d.%03d%03d%03d" % ( + self._s, self._ms, self._us, self._ns) + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) + + # by default not showing nano + if self._ms or self._us or self._ns: + seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) + else: + seconds_pretty = "%02d" % self._s + + # if we have a partial day + subs = (self._h or self._m or self._s or + self._ms or self._us or self._ns) + + if format == 'even_day': + if not subs: + return "%s%d days" % (sign_pretty, self._d) + + elif format == 'sub_day': + if not self._d: + + # degenerate, don't need the extra space + if self._sign > 0: + sign2_pretty = "" + return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, + self._h, self._m, seconds_pretty) + + if subs or format=='long': + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) + return "%s%d days" % (sign_pretty, self._d) + + def __repr__(self): + return "Timedelta('{0}')".format(self._repr_base(format='long')) + + def __str__(self): + return self._repr_base(format='long') + + def isoformat(self): + """ + Format Timedelta as ISO 8601 Duration like + `P[n]Y[n]M[n]DT[n]H[n]M[n]S`, where the `[n]`s are replaced by the + values. See https://en.wikipedia.org/wiki/ISO_8601#Durations + + .. versionadded:: 0.20.0 + + Returns + ------- + formatted : str + + Notes + ----- + The longest component is days, whose value may be larger than + 365. + Every component is always included, even if its value is 0. + Pandas uses nanosecond precision, so up to 9 decimal places may + be included in the seconds component. + Trailing 0's are removed from the seconds component after the decimal. + We do not 0 pad components, so it's `...T5H...`, not `...T05H...` + + Examples + -------- + >>> td = pd.Timedelta(days=6, minutes=50, seconds=3, + ... 
milliseconds=10, microseconds=10, nanoseconds=12) + >>> td.isoformat() + 'P6DT0H50M3.010010012S' + >>> pd.Timedelta(hours=1, seconds=10).isoformat() + 'P0DT0H0M10S' + >>> pd.Timedelta(hours=1, seconds=10).isoformat() + 'P0DT0H0M10S' + >>> pd.Timedelta(days=500.5).isoformat() + 'P500DT12H0MS' + + See Also + -------- + Timestamp.isoformat + """ + components = self.components + seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, + components.milliseconds, + components.microseconds, + components.nanoseconds) + # Trim unnecessary 0s, 1.000000000 -> 1 + seconds = seconds.rstrip('0').rstrip('.') + tpl = 'P{td.days}DT{td.hours}H{td.minutes}M{seconds}S'.format( + td=components, seconds=seconds) + return tpl From 4bfbca9a6d1e75363c401d934a6044bafcf526da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 3 Nov 2017 08:48:23 -0400 Subject: [PATCH 22/44] COMPAT: compare platform return on 32-bit (#18090) xref #18047 --- pandas/tests/test_algos.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6a5c0ae11abb7..240a7ad4b22f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1243,13 +1243,21 @@ def test_groupsort_indexer(): result = libalgos.groupsort_indexer(a, 1000)[0] # need to use a stable sort + # np.argsort returns int, groupsort_indexer + # always returns int64 expected = np.argsort(a, kind='mergesort') + expected = expected.astype(np.int64) + tm.assert_numpy_array_equal(result, expected) # compare with lexsort + # np.lexsort returns int, groupsort_indexer + # always returns int64 key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) + expected = expected.astype(np.int64) + tm.assert_numpy_array_equal(result, expected) From dd761d380bd25f0ac85fa12279f62ea2450d01b1 Mon Sep 17 00:00:00 2001 From: Manraj Singh Date: Sat, 4 Nov 2017 04:33:51 +0530 Subject: [PATCH 23/44] Fix 18068: Updates merge_asof error, now outputs datatypes (#18082) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/reshape/merge.py | 8 +++++--- pandas/tests/reshape/test_merge_asof.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4adafe7c06450..b7b8240a8d77e 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -102,7 +102,7 @@ Sparse Reshaping ^^^^^^^^^ -- +- Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - - diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e409090e76944..0234a5563326c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1253,10 +1253,12 @@ def _get_merge_keys(self): join_names) = super(_AsOfMerge, self)._get_merge_keys() # validate index types are the same - for lk, rk in zip(left_join_keys, right_join_keys): + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys, " - "must be the same type") + raise MergeError("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type" + .format(i=i, lkdtype=lk.dtype, + rkdtype=rk.dtype)) # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py index 78bfa2ff8597c..4b2680b9be592 100644 --- 
a/pandas/tests/reshape/test_merge_asof.py +++ b/pandas/tests/reshape/test_merge_asof.py @@ -973,3 +973,15 @@ def test_on_float_by_int(self): columns=['symbol', 'exch', 'price', 'mpv']) assert_frame_equal(result, expected) + + def test_merge_datatype_error(self): + """ Tests merge datatype mismatch error """ + msg = 'merge keys \[0\] object and int64, must be the same type' + + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': ['a', 'b', 'c']}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': [1, 2, 3, 6, 7]}) + + with tm.assert_raises_regex(MergeError, msg): + merge_asof(left, right, on='a') From a6353dd36d120cb33f26b3b45006f502c38f8ed9 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Sat, 4 Nov 2017 23:43:06 +0900 Subject: [PATCH 24/44] TST: Add regression test for empty DataFrame groupby (#18097) --- pandas/tests/groupby/test_groupby.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6f022aeff577b..2f750a7621905 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2732,6 +2732,16 @@ def h(df, arg3): assert_series_equal(result, expected) + def test_empty_dataframe_groupby(self): + # GH8093 + df = DataFrame(columns=['A', 'B', 'C']) + + result = df.groupby('A').sum() + expected = DataFrame(columns=['B', 'C'], dtype=np.float64) + expected.index.name = 'A' + + assert_frame_equal(result, expected) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From c4409814e578e8bf37513f340f0428b6fbed938a Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Sun, 5 Nov 2017 00:19:58 +0900 Subject: [PATCH 25/44] BUG: Fix the error when reading the compressed UTF-16 file (#18091) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/_libs/parsers.pyx | 30 +++++++++++------- pandas/io/parsers.py | 4 ++- pandas/tests/io/parser/compression.py | 14 ++++++++ .../tests/io/parser/data/utf16_ex_small.zip | Bin 0 -> 285 bytes 5 files changed, 37 insertions(+), 13 deletions(-) create mode 100644 pandas/tests/io/parser/data/utf16_ex_small.zip diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index b7b8240a8d77e..e19f09b195ce0 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -76,7 +76,7 @@ I/O ^^^ - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. 
- +- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a5ce6c560d844..85857c158f96e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -374,6 +374,17 @@ cdef class TextReader: float_precision=None, skip_blank_lines=True): + # set encoding for native Python and C library + if encoding is not None: + if not isinstance(encoding, bytes): + encoding = encoding.encode('utf-8') + encoding = encoding.lower() + self.c_encoding = encoding + else: + self.c_encoding = NULL + + self.encoding = encoding + self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -495,17 +506,6 @@ cdef class TextReader: self.parser.double_converter_nogil = NULL self.parser.double_converter_withgil = round_trip - # encoding - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding - if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) for k in dtype} @@ -684,6 +684,14 @@ cdef class TextReader: else: raise ValueError('Unrecognized compression type: %s' % self.compression) + + if b'utf-16' in (self.encoding or b''): + # we need to read utf-16 through UTF8Recoder. + # if source is utf-16, convert source to utf-8 by UTF8Recoder. + source = com.UTF8Recoder(source, self.encoding.decode('utf-8')) + self.encoding = b'utf-8' + self.c_encoding = self.encoding + self.handle = source if isinstance(source, basestring): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1b6414ea974fa..7f3f5630e49f9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1671,7 +1671,9 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if 'utf-16' in (kwds.get('encoding') or ''): + if (kwds.get('compression') is None + and 'utf-16' in (kwds.get('encoding') or '')): + # if source is utf-16 plain text, convert source to utf-8 if isinstance(src, compat.string_types): src = open(src, 'rb') self.handles.append(src) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 797c12139656d..84db9d14eee07 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -7,6 +7,7 @@ import pytest +import pandas as pd import pandas.util.testing as tm @@ -157,6 +158,19 @@ def test_read_csv_infer_compression(self): inputs[3].close() + def test_read_csv_compressed_utf16_example(self): + # GH18071 + path = tm.get_data_path('utf16_ex_small.zip') + + result = self.read_csv(path, encoding='utf-16', + compression='zip', sep='\t') + expected = pd.DataFrame({ + u'Country': [u'Venezuela', u'Venezuela'], + u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.'] + }) + + tm.assert_frame_equal(result, expected) + def test_invalid_compression(self): msg = 'Unrecognized compression type: sfark' with tm.assert_raises_regex(ValueError, msg): diff --git a/pandas/tests/io/parser/data/utf16_ex_small.zip b/pandas/tests/io/parser/data/utf16_ex_small.zip new file mode 100644 index 0000000000000000000000000000000000000000..b0560c1b1f6c41307b575f2a86509021b49649f4 GIT binary patch literal 285 zcmWIWW@Zs#U|`^2c&?n{%@J5Cmki|10Ae8q8HUo5G()ra)Qb4x+{Bz5y^@NO&=5`r zX2v6bB0;#cf}4SnIF&U?+Ut|9%K z*xYyP>aL!jYp@|<#>O`+A~_|u(y}H_2)et1Z)sMQR;tn2-aQ>H(-D~jm`P25a**gJ;0ll4Wxq+ M2qS>>dJu;J0ImR5j{pDw literal 0 HcmV?d00001 From 
2c3faada687021ca12fc4778c9b0ef7a0bc3ea41 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Sun, 5 Nov 2017 02:07:35 +0900 Subject: [PATCH 26/44] BUG: Implement PeriodEngine to fix PeriodIndex truncate bug (#17755) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/_libs/index.pyx | 54 ++++- pandas/_libs/index_class_helper.pxi.in | 5 +- pandas/core/indexes/period.py | 8 +- pandas/tests/indexes/period/test_indexing.py | 196 ++++++++++++++++++- pandas/tests/series/test_period.py | 30 +++ 6 files changed, 289 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c41da4d67afe5..5c64b0a55c09b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -100,7 +100,7 @@ Conversion Indexing ^^^^^^^^ -- +- Bug in :func:`PeriodIndex.truncate` which raises ``TypeError`` when ``PeriodIndex`` is monotonic (:issue:`17717`) - - diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e98c0131e9c44..78eb7b3ae483e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,7 +17,7 @@ from tslib cimport _to_i8 from hashtable cimport HashTable -from pandas._libs import algos, hashtable as _hash +from pandas._libs import algos, period as periodlib, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta @@ -270,13 +270,16 @@ cdef class IndexEngine: values = self._get_index_values() self.mapping = self._make_hash_table(len(values)) - self.mapping.map_locations(values) + self._call_map_locations(values) if len(self.mapping) == len(values): self.unique = 1 self.need_unique_check = 0 + cpdef _call_map_locations(self, values): + self.mapping.map_locations(values) + def clear_mapping(self): self.mapping = None self.need_monotonic_check = 1 @@ -490,6 +493,53 @@ cdef class TimedeltaEngine(DatetimeEngine): cdef _get_box_dtype(self): return 'm8[ns]' + +cdef class PeriodEngine(Int64Engine): + + cdef _get_index_values(self): + return super(PeriodEngine, self).vgetter() + + cpdef _call_map_locations(self, values): + super(PeriodEngine, self)._call_map_locations(values.view('i8')) + + def _call_monotonic(self, values): + return super(PeriodEngine, self)._call_monotonic(values.view('i8')) + + def get_indexer(self, values): + cdef ndarray[int64_t, ndim=1] ordinals + + super(PeriodEngine, self)._ensure_mapping_populated() + + freq = super(PeriodEngine, self).vgetter().freq + ordinals = periodlib.extract_ordinals(values, freq) + + return self.mapping.lookup(ordinals) + + def get_pad_indexer(self, other, limit=None): + freq = super(PeriodEngine, self).vgetter().freq + ordinal = periodlib.extract_ordinals(other, freq) + + return algos.pad_int64(self._get_index_values(), + np.asarray(ordinal), limit=limit) + + def get_backfill_indexer(self, other, limit=None): + freq = super(PeriodEngine, self).vgetter().freq + ordinal = periodlib.extract_ordinals(other, freq) + + return algos.backfill_int64(self._get_index_values(), + np.asarray(ordinal), limit=limit) + + def get_indexer_non_unique(self, targets): + freq = super(PeriodEngine, self).vgetter().freq + ordinal = periodlib.extract_ordinals(targets, freq) + ordinal_array = np.asarray(ordinal) + + return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array) + + cdef _get_index_values_for_bool_indexer(self): + return self._get_index_values().view('i8') + + cpdef convert_scalar(ndarray arr, object value): # we don't turn integers # into datetimes/timedeltas diff --git a/pandas/_libs/index_class_helper.pxi.in 
b/pandas/_libs/index_class_helper.pxi.in index 76c0deef7ebee..b9fc0ddd7ea1c 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -66,7 +66,7 @@ cdef class {{name}}Engine(IndexEngine): raise KeyError(val) {{endif}} - values = self._get_index_values() + values = self._get_index_values_for_bool_indexer() n = len(values) result = np.empty(n, dtype=bool) @@ -86,6 +86,9 @@ cdef class {{name}}Engine(IndexEngine): return last_true return result + + cdef _get_index_values_for_bool_indexer(self): + return self._get_index_values() {{endif}} {{endfor}} diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 148ca2725fbdc..c4938b556c8dd 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -31,7 +31,7 @@ import pandas.tseries.offsets as offsets from pandas._libs.lib import infer_dtype -from pandas._libs import tslib, period +from pandas._libs import tslib, period, index as libindex from pandas._libs.period import (Period, IncompatibleFrequency, get_period_field_arr, _validate_end_alias, _quarter_to_myear) @@ -192,6 +192,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): freq = None + _engine_type = libindex.PeriodEngine + __eq__ = _period_index_cmp('__eq__') __ne__ = _period_index_cmp('__ne__', nat_result=True) __lt__ = _period_index_cmp('__lt__') @@ -275,6 +277,10 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, data = period.extract_ordinals(data, freq) return cls._from_ordinals(data, name=name, freq=freq) + @cache_readonly + def _engine(self): + return self._engine_type(lambda: self, len(self)) + @classmethod def _generate_range(cls, start, end, periods, freq, fields): if freq is not None: diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index efc13a56cd77e..d99eba3e2d5e9 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -6,7 +6,7 @@ import pandas as pd from pandas.util import testing as tm from pandas.compat import lrange -from pandas._libs import tslib +from pandas._libs import tslib, tslibs from pandas import (PeriodIndex, Series, DatetimeIndex, period_range, Period) @@ -310,3 +310,197 @@ def test_take_fill_value(self): with pytest.raises(IndexError): idx.take(np.array([1, -5])) + + def test_get_loc(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + + # get the location of p1/p2 from + # monotonic increasing PeriodIndex with non-duplicate + idx0 = pd.PeriodIndex([p0, p1, p2]) + expected_idx1_p1 = 1 + expected_idx1_p2 = 2 + + assert idx0.get_loc(p1) == expected_idx1_p1 + assert idx0.get_loc(str(p1)) == expected_idx1_p1 + assert idx0.get_loc(p2) == expected_idx1_p2 + assert idx0.get_loc(str(p2)) == expected_idx1_p2 + + pytest.raises(tslibs.parsing.DateParseError, idx0.get_loc, 'foo') + pytest.raises(KeyError, idx0.get_loc, 1.1) + pytest.raises(TypeError, idx0.get_loc, idx0) + + # get the location of p1/p2 from + # monotonic increasing PeriodIndex with duplicate + idx1 = pd.PeriodIndex([p1, p1, p2]) + expected_idx1_p1 = slice(0, 2) + expected_idx1_p2 = 2 + + assert idx1.get_loc(p1) == expected_idx1_p1 + assert idx1.get_loc(str(p1)) == expected_idx1_p1 + assert idx1.get_loc(p2) == expected_idx1_p2 + assert idx1.get_loc(str(p2)) == expected_idx1_p2 + + pytest.raises(tslibs.parsing.DateParseError, idx1.get_loc, 'foo') + pytest.raises(KeyError, idx1.get_loc, 1.1) + 
pytest.raises(TypeError, idx1.get_loc, idx1) + + # get the location of p1/p2 from + # non-monotonic increasing/decreasing PeriodIndex with duplicate + idx2 = pd.PeriodIndex([p2, p1, p2]) + expected_idx2_p1 = 1 + expected_idx2_p2 = np.array([True, False, True]) + + assert idx2.get_loc(p1) == expected_idx2_p1 + assert idx2.get_loc(str(p1)) == expected_idx2_p1 + tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2) + tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2) + + def test_is_monotonic_increasing(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + + idx_inc0 = pd.PeriodIndex([p0, p1, p2]) + idx_inc1 = pd.PeriodIndex([p0, p1, p1]) + idx_dec0 = pd.PeriodIndex([p2, p1, p0]) + idx_dec1 = pd.PeriodIndex([p2, p1, p1]) + idx = pd.PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_increasing + assert idx_inc1.is_monotonic_increasing + assert not idx_dec0.is_monotonic_increasing + assert not idx_dec1.is_monotonic_increasing + assert not idx.is_monotonic_increasing + + def test_is_monotonic_decreasing(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + + idx_inc0 = pd.PeriodIndex([p0, p1, p2]) + idx_inc1 = pd.PeriodIndex([p0, p1, p1]) + idx_dec0 = pd.PeriodIndex([p2, p1, p0]) + idx_dec1 = pd.PeriodIndex([p2, p1, p1]) + idx = pd.PeriodIndex([p1, p2, p0]) + + assert not idx_inc0.is_monotonic_decreasing + assert not idx_inc1.is_monotonic_decreasing + assert idx_dec0.is_monotonic_decreasing + assert idx_dec1.is_monotonic_decreasing + assert not idx.is_monotonic_decreasing + + def test_is_unique(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + + idx0 = pd.PeriodIndex([p0, p1, p2]) + assert idx0.is_unique + + idx1 = pd.PeriodIndex([p1, p1, p2]) + assert not idx1.is_unique + + def test_contains(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + p3 = pd.Period('2017-09-04') + + ps0 = [p0, p1, p2] + idx0 = pd.PeriodIndex(ps0) + + for p in ps0: + assert idx0.contains(p) + assert p in idx0 + + assert idx0.contains(str(p)) + assert str(p) in idx0 + + assert idx0.contains('2017-09-01 00:00:01') + assert '2017-09-01 00:00:01' in idx0 + + assert idx0.contains('2017-09') + assert '2017-09' in idx0 + + assert not idx0.contains(p3) + assert p3 not in idx0 + + def test_get_value(self): + # GH 17717 + p0 = pd.Period('2017-09-01') + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + + idx0 = pd.PeriodIndex([p0, p1, p2]) + input0 = np.array([1, 2, 3]) + expected0 = 2 + + result0 = idx0.get_value(input0, p1) + assert result0 == expected0 + + idx1 = pd.PeriodIndex([p1, p1, p2]) + input1 = np.array([1, 2, 3]) + expected1 = np.array([1, 2]) + + result1 = idx1.get_value(input1, p1) + tm.assert_numpy_array_equal(result1, expected1) + + idx2 = pd.PeriodIndex([p1, p2, p1]) + input2 = np.array([1, 2, 3]) + expected2 = np.array([1, 3]) + + result2 = idx2.get_value(input2, p1) + tm.assert_numpy_array_equal(result2, expected2) + + def test_get_indexer(self): + # GH 17717 + p1 = pd.Period('2017-09-01') + p2 = pd.Period('2017-09-04') + p3 = pd.Period('2017-09-07') + + tp0 = pd.Period('2017-08-31') + tp1 = pd.Period('2017-09-02') + tp2 = pd.Period('2017-09-05') + tp3 = pd.Period('2017-09-09') + + idx = pd.PeriodIndex([p1, p2, p3]) + + tm.assert_numpy_array_equal(idx.get_indexer(idx), + np.array([0, 1, 2], dtype=np.intp)) + + target = 
pd.PeriodIndex([tp0, tp1, tp2, tp3]) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), + np.array([-1, 0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), + np.array([0, 1, 2, -1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), + np.array([0, 0, 1, 2], dtype=np.intp)) + + res = idx.get_indexer(target, 'nearest', + tolerance=pd.Timedelta('1 day')) + tm.assert_numpy_array_equal(res, + np.array([0, 0, 1, -1], dtype=np.intp)) + + def test_get_indexer_non_unique(self): + # GH 17717 + p1 = pd.Period('2017-09-02') + p2 = pd.Period('2017-09-03') + p3 = pd.Period('2017-09-04') + p4 = pd.Period('2017-09-05') + + idx1 = pd.PeriodIndex([p1, p2, p1]) + idx2 = pd.PeriodIndex([p2, p1, p3, p4]) + + result = idx1.get_indexer_non_unique(idx2) + expected_indexer = np.array([1, 0, 2, -1, -1], dtype=np.int64) + expected_missing = np.array([2, 3], dtype=np.int64) + + tm.assert_numpy_array_equal(result[0], expected_indexer) + tm.assert_numpy_array_equal(result[1], expected_missing) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index e907b0edd5c6a..b4ff25d2630b8 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -249,3 +249,33 @@ def test_align_series(self): msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" with tm.assert_raises_regex(period.IncompatibleFrequency, msg): ts + ts.asfreq('D', how="end") + + def test_truncate(self): + # GH 17717 + idx1 = pd.PeriodIndex([ + pd.Period('2017-09-02'), + pd.Period('2017-09-02'), + pd.Period('2017-09-03') + ]) + series1 = pd.Series([1, 2, 3], index=idx1) + result1 = series1.truncate(after='2017-09-02') + + expected_idx1 = pd.PeriodIndex([ + pd.Period('2017-09-02'), + pd.Period('2017-09-02') + ]) + tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1)) + + idx2 = pd.PeriodIndex([ + pd.Period('2017-09-03'), + pd.Period('2017-09-02'), + pd.Period('2017-09-03') + ]) + series2 = pd.Series([1, 2, 3], index=idx2) + result2 = series2.truncate(after='2017-09-02') + + expected_idx2 = pd.PeriodIndex([ + pd.Period('2017-09-03'), + pd.Period('2017-09-02') + ]) + tm.assert_series_equal(result2, pd.Series([1, 2], index=expected_idx2)) From fff48bb6e1b3f97ee2314eba5948b8994bacc5a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Nov 2017 11:10:45 -0700 Subject: [PATCH 27/44] standardize indentation, arrange in allphabetical order (#18104) --- setup.py | 170 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 98 insertions(+), 72 deletions(-) diff --git a/setup.py b/setup.py index 783ded906eba2..684f32d1e7898 100755 --- a/setup.py +++ b/setup.py @@ -480,78 +480,104 @@ def pxd(name): libraries = ['m'] if not is_platform_windows() else [] ext_data = { - '_libs.lib': {'pyxfile': '_libs/lib', - 'depends': lib_depends + tseries_depends}, - '_libs.properties': {'pyxfile': '_libs/properties', 'include': []}, - '_libs.hashtable': {'pyxfile': '_libs/hashtable', - 'pxdfiles': ['_libs/hashtable'], - 'depends': (['pandas/_libs/src/klib/khash_python.h'] + - _pxi_dep['hashtable'])}, - '_libs.tslibs.strptime': {'pyxfile': '_libs/tslibs/strptime', - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.offsets': {'pyxfile': '_libs/tslibs/offsets'}, - '_libs.tslib': {'pyxfile': '_libs/tslib', - 'pxdfiles': ['_libs/src/util'], - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.conversion': {'pyxfile': 
'_libs/tslibs/conversion', - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.np_datetime': {'pyxfile': '_libs/tslibs/np_datetime', - 'depends': np_datetime_headers, - 'sources': np_datetime_sources}, - '_libs.tslibs.timedeltas': {'pyxfile': '_libs/tslibs/timedeltas'}, - '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, - '_libs.tslibs.fields': {'pyxfile': '_libs/tslibs/fields', - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.period': {'pyxfile': '_libs/period', - 'depends': (tseries_depends + - ['pandas/_libs/src/period_helper.h']), - 'sources': np_datetime_sources + [ - 'pandas/_libs/src/period_helper.c']}, - '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing', - 'pxdfiles': ['_libs/src/util']}, - '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', - 'pxdfiles': ['_libs/src/util']}, - '_libs.tslibs.nattype': {'pyxfile': '_libs/tslibs/nattype', - 'pxdfiles': ['_libs/src/util']}, - '_libs.index': {'pyxfile': '_libs/index', - 'sources': np_datetime_sources, - 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], - 'depends': _pxi_dep['index']}, - '_libs.algos': {'pyxfile': '_libs/algos', - 'pxdfiles': ['_libs/src/util', - '_libs/algos', '_libs/hashtable'], - 'depends': _pxi_dep['algos']}, - '_libs.groupby': {'pyxfile': '_libs/groupby', - 'pxdfiles': ['_libs/src/util', '_libs/algos'], - 'depends': _pxi_dep['groupby']}, - '_libs.join': {'pyxfile': '_libs/join', - 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], - 'depends': _pxi_dep['join']}, - '_libs.reshape': {'pyxfile': '_libs/reshape', - 'depends': _pxi_dep['reshape']}, - '_libs.indexing': {'pyxfile': '_libs/indexing'}, - '_libs.interval': {'pyxfile': '_libs/interval', - 'pxdfiles': ['_libs/hashtable'], - 'depends': _pxi_dep['interval']}, - '_libs.window': {'pyxfile': '_libs/window', - 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], - 'depends': ['pandas/_libs/src/skiplist.pyx', - 'pandas/_libs/src/skiplist.h']}, - '_libs.parsers': {'pyxfile': '_libs/parsers', - 'depends': ['pandas/_libs/src/parser/tokenizer.h', - 'pandas/_libs/src/parser/io.h', - 'pandas/_libs/src/numpy_helper.h'], - 'sources': ['pandas/_libs/src/parser/tokenizer.c', - 'pandas/_libs/src/parser/io.c']}, - '_libs.sparse': {'pyxfile': '_libs/sparse', - 'depends': _pxi_dep['sparse']}, - '_libs.testing': {'pyxfile': '_libs/testing'}, - '_libs.hashing': {'pyxfile': '_libs/hashing'}, - 'io.sas._sas': {'pyxfile': 'io/sas/sas'}} + '_libs.algos': { + 'pyxfile': '_libs/algos', + 'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'], + 'depends': _pxi_dep['algos']}, + '_libs.groupby': { + 'pyxfile': '_libs/groupby', + 'pxdfiles': ['_libs/src/util', '_libs/algos'], + 'depends': _pxi_dep['groupby']}, + '_libs.hashing': { + 'pyxfile': '_libs/hashing'}, + '_libs.hashtable': { + 'pyxfile': '_libs/hashtable', + 'pxdfiles': ['_libs/hashtable'], + 'depends': (['pandas/_libs/src/klib/khash_python.h'] + + _pxi_dep['hashtable'])}, + '_libs.index': { + 'pyxfile': '_libs/index', + 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'depends': _pxi_dep['index'], + 'sources': np_datetime_sources}, + '_libs.indexing': { + 'pyxfile': '_libs/indexing'}, + '_libs.interval': { + 'pyxfile': '_libs/interval', + 'pxdfiles': ['_libs/hashtable'], + 'depends': _pxi_dep['interval']}, + '_libs.join': { + 'pyxfile': '_libs/join', + 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'depends': _pxi_dep['join']}, + '_libs.lib': { + 'pyxfile': '_libs/lib', + 'depends': lib_depends + tseries_depends}, + 
'_libs.parsers': { + 'pyxfile': '_libs/parsers', + 'depends': ['pandas/_libs/src/parser/tokenizer.h', + 'pandas/_libs/src/parser/io.h', + 'pandas/_libs/src/numpy_helper.h'], + 'sources': ['pandas/_libs/src/parser/tokenizer.c', + 'pandas/_libs/src/parser/io.c']}, + '_libs.period': { + 'pyxfile': '_libs/period', + 'depends': tseries_depends + ['pandas/_libs/src/period_helper.h'], + 'sources': np_datetime_sources + ['pandas/_libs/src/period_helper.c']}, + '_libs.properties': { + 'pyxfile': '_libs/properties', + 'include': []}, + '_libs.reshape': { + 'pyxfile': '_libs/reshape', + 'depends': _pxi_dep['reshape']}, + '_libs.sparse': { + 'pyxfile': '_libs/sparse', + 'depends': _pxi_dep['sparse']}, + '_libs.tslib': { + 'pyxfile': '_libs/tslib', + 'pxdfiles': ['_libs/src/util'], + 'depends': tseries_depends, + 'sources': np_datetime_sources}, + '_libs.tslibs.conversion': { + 'pyxfile': '_libs/tslibs/conversion', + 'depends': tseries_depends, + 'sources': np_datetime_sources}, + '_libs.tslibs.fields': { + 'pyxfile': '_libs/tslibs/fields', + 'depends': tseries_depends, + 'sources': np_datetime_sources}, + '_libs.tslibs.frequencies': { + 'pyxfile': '_libs/tslibs/frequencies', + 'pxdfiles': ['_libs/src/util']}, + '_libs.tslibs.nattype': { + 'pyxfile': '_libs/tslibs/nattype', + 'pxdfiles': ['_libs/src/util']}, + '_libs.tslibs.np_datetime': { + 'pyxfile': '_libs/tslibs/np_datetime', + 'depends': np_datetime_headers, + 'sources': np_datetime_sources}, + '_libs.tslibs.offsets': { + 'pyxfile': '_libs/tslibs/offsets'}, + '_libs.tslibs.parsing': { + 'pyxfile': '_libs/tslibs/parsing', + 'pxdfiles': ['_libs/src/util']}, + '_libs.tslibs.strptime': { + 'pyxfile': '_libs/tslibs/strptime', + 'depends': tseries_depends, + 'sources': np_datetime_sources}, + '_libs.tslibs.timedeltas': { + 'pyxfile': '_libs/tslibs/timedeltas'}, + '_libs.tslibs.timezones': { + 'pyxfile': '_libs/tslibs/timezones'}, + '_libs.testing': { + 'pyxfile': '_libs/testing'}, + '_libs.window': { + 'pyxfile': '_libs/window', + 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], + 'depends': ['pandas/_libs/src/skiplist.pyx', + 'pandas/_libs/src/skiplist.h']}, + 'io.sas._sas': { + 'pyxfile': 'io/sas/sas'}} extensions = [] From 00f61bbd657e46d750f0b6f4ceb926b746406788 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 4 Nov 2017 13:29:46 -0700 Subject: [PATCH 28/44] BLD: Make sure to copy ZIP files for parser tests (#18108) Follow-up to gh-18091. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 684f32d1e7898..3464169e8d8d1 100755 --- a/setup.py +++ b/setup.py @@ -758,6 +758,7 @@ def pxd(name): 'parser/data/*.bz2', 'parser/data/*.txt', 'parser/data/*.tar', + 'parser/data/*.zip', 'parser/data/*.tar.gz', 'sas/data/*.csv', 'sas/data/*.xpt', From 69a3b064679c8bfa4ad40718b2fef4a72ec58b77 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Nov 2017 17:14:10 -0400 Subject: [PATCH 29/44] Revert "CI: temp disable scipy on windows 3.6 build (#18078)" (#18105) * Revert "CI: temp disable scipy on windows 3.6 build (#18078)" This reverts commit cd6dc87466e119aabb76d8439df8289d082ea948. 
* use numpy=1.13 --- appveyor.yml | 2 +- ci/requirements-3.6_WIN.run | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index a1f8886f6d068..44af73b498aa8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,7 +22,7 @@ environment: PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "112" + CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run index 5d6c074ec1f85..db2d429a2a4ff 100644 --- a/ci/requirements-3.6_WIN.run +++ b/ci/requirements-3.6_WIN.run @@ -1,12 +1,12 @@ python-dateutil pytz -numpy=1.12* +numpy=1.13* bottleneck openpyxl xlsxwriter xlrd xlwt -# scipy +scipy feather-format numexpr pytables From ffd363b584199d7bb429b232c2b3e3e0329a8599 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 4 Nov 2017 14:47:02 -0700 Subject: [PATCH 30/44] Masking and overflow checks for datetimeindex and timedeltaindex ops (#18020) closes #17991 --- doc/source/whatsnew/v0.21.1.txt | 2 ++ pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 6 ++-- pandas/core/indexes/timedeltas.py | 3 +- .../indexes/datetimes/test_arithmetic.py | 34 +++++++++++++++++++ .../indexes/timedeltas/test_arithmetic.py | 20 +++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e19f09b195ce0..69379ac2fc58c 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -57,6 +57,8 @@ Documentation Changes Bug Fixes ~~~~~~~~~ - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) +- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) +- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index cc9361b550c5b..ebc0d50d8ba05 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -679,7 +679,7 @@ def __sub__(self, other): return self._add_delta(-other) elif is_integer(other): return self.shift(-other) - elif isinstance(other, datetime): + elif isinstance(other, (datetime, np.datetime64)): return self._sub_datelike(other) elif isinstance(other, Period): return self._sub_period(other) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index bc4fe0060dce0..7057cdb5b20cc 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -29,6 +29,7 @@ import pandas.core.dtypes.concat as _concat from pandas.errors import PerformanceWarning from pandas.core.common import _values_from_object, _maybe_box +from pandas.core.algorithms import checked_add_with_arr from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index, Float64Index @@ -767,7 +768,7 @@ def _sub_datelike(self, other): raise TypeError("DatetimeIndex subtraction must have the same " "timezones or no timezones") result = self._sub_datelike_dti(other) - elif isinstance(other, datetime): + elif isinstance(other, (datetime, np.datetime64)): other = Timestamp(other) if other is libts.NaT: result = self._nat_new(box=False) @@ -777,7 +778,8 @@ def _sub_datelike(self, other): "timezones or no timezones") else: i8 = self.asi8 - result = i8 - other.value + result 
= checked_add_with_arr(i8, -other.value, + arr_mask=self._isnan) result = self._maybe_mask_results(result, fill_value=libts.iNaT) else: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9647cef608d4e..729edc81bb642 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -361,7 +361,8 @@ def _add_datelike(self, other): else: other = Timestamp(other) i8 = self.asi8 - result = checked_add_with_arr(i8, other.value) + result = checked_add_with_arr(i8, other.value, + arr_mask=self._isnan) result = self._maybe_mask_results(result, fill_value=iNaT) return DatetimeIndex(result, name=self.name, copy=False) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index bf0217e9bf22a..e078413c9398c 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -199,6 +199,40 @@ def test_ufunc_coercions(self): tm.assert_index_equal(result, exp) assert result.freq == 'D' + def test_datetimeindex_sub_timestamp_overflow(self): + dtimax = pd.to_datetime(['now', pd.Timestamp.max]) + dtimin = pd.to_datetime(['now', pd.Timestamp.min]) + + tsneg = Timestamp('1950-01-01') + ts_neg_variants = [tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype('datetime64[ns]'), + tsneg.to_datetime64().astype('datetime64[D]')] + + tspos = Timestamp('1980-01-01') + ts_pos_variants = [tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype('datetime64[ns]'), + tspos.to_datetime64().astype('datetime64[D]')] + + for variant in ts_neg_variants: + with pytest.raises(OverflowError): + dtimax - variant + + expected = pd.Timestamp.max.value - tspos.value + for variant in ts_pos_variants: + res = dtimax - variant + assert res[1].value == expected + + expected = pd.Timestamp.min.value - tsneg.value + for variant in ts_neg_variants: + res = dtimin - variant + assert res[1].value == expected + + for variant in ts_pos_variants: + with pytest.raises(OverflowError): + dtimin - variant + # GH 10699 @pytest.mark.parametrize('klass,assert_func', zip([Series, DatetimeIndex], diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index bbc8dd6577b2c..514702e15f7e1 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -576,6 +576,26 @@ def test_add_overflow(self): to_timedelta(['7 seconds', pd.NaT, '4 hours'])) tm.assert_index_equal(result, exp) + def test_timedeltaindex_add_timestamp_nat_masking(self): + # GH17991 checking for overflow-masking with NaT + tdinat = pd.to_timedelta(['24658 days 11:15:00', 'NaT']) + + tsneg = Timestamp('1950-01-01') + ts_neg_variants = [tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype('datetime64[ns]'), + tsneg.to_datetime64().astype('datetime64[D]')] + + tspos = Timestamp('1980-01-01') + ts_pos_variants = [tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype('datetime64[ns]'), + tspos.to_datetime64().astype('datetime64[D]')] + + for variant in ts_neg_variants + ts_pos_variants: + res = tdinat + variant + assert res[1] is pd.NaT + def test_tdi_ops_attributes(self): rng = timedelta_range('2 days', periods=5, freq='2D', name='x') From 8587a3d932ad65e7f42212456e55f35faea8693d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 4 Nov 2017 21:36:51 -0700 Subject: [PATCH 31/44] BUG: Override mi-columns in to_csv if requested (#18110) Previously, MultiIndex columns 
weren't being overwritten when header was passed in for to_csv. Closes gh-5539 --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/formats/format.py | 2 +- pandas/tests/frame/test_to_csv.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 69379ac2fc58c..6dc329c4aa732 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -79,6 +79,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) +- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) Plotting ^^^^^^^^ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c5d4a0ecf44ab..ab98b9c4e4f49 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1695,7 +1695,7 @@ def _save_header(self): else: encoded_labels = [] - if not has_mi_columns: + if not has_mi_columns or has_aliases: encoded_labels += list(write_cols) writer.writerow(encoded_labels) else: diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4162a586f8063..ca8a0d8bda3ab 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1203,3 +1203,16 @@ def test_period_index_date_overflow(self): expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n' assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), + ("b", 1), ("b", 2)]) + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n" + assert result == expected From 763b5f71d41e84410780084af3742c88e84786b3 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Sun, 5 Nov 2017 12:04:31 +0100 Subject: [PATCH 32/44] fix failing tests. --- pandas/core/indexes/datetimes.py | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 7057cdb5b20cc..4f9692003d18d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1265,7 +1265,7 @@ def _parsed_string_to_bounds(self, reso, parsed): ---------- reso : Resolution Resolution provided by parsed string. - parsed : datetime + parsed : datetime | bool Datetime from parsed string. 
Returns @@ -1273,61 +1273,63 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ - if parsed.tzinfo is None: + if not hasattr(parsed, "tzinfo"): + target_tz = None + elif parsed.tzinfo is None: target_tz = self.tz else: target_tz = parsed.tzinfo def translate(timestamp_lower, timestamp_upper): - if target_tz == self.tz: - return timestamp_lower, timestamp_upper - else: + if self.tz is not None and parsed.tzinfo is not None: return ( timestamp_lower.tz_convert(self.tz), timestamp_upper.tz_convert(self.tz) ) + else: + return timestamp_lower, timestamp_upper if reso == 'year': return translate(Timestamp(datetime(parsed.year, 1, 1), tz=target_tz), - Timestamp(datetime(parsed.year, 12, 31, 23, + Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=target_tz)) elif reso == 'month': d = libts.monthrange(parsed.year, parsed.month)[1] return translate(Timestamp(datetime(parsed.year, parsed.month, 1), - tz=target_tz), - Timestamp(datetime(parsed.year, parsed.month, d, 23, + tz=target_tz), + Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), target_tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month return translate(Timestamp(datetime(parsed.year, parsed.month, 1), - tz=target_tz), - Timestamp(datetime(parsed.year, qe, d, 23, 59, + tz=target_tz), + Timestamp(datetime(parsed.year, qe, d, 23, 59, 59, 999999), tz=target_tz)) elif reso == 'day': st = datetime(parsed.year, parsed.month, parsed.day) return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Day(), - tz=target_tz).value - 1)) + Timestamp(Timestamp(st + offsets.Day(), + tz=target_tz).value - 1)) elif reso == 'hour': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Hour(), - tz=target_tz).value - 1)) + Timestamp(Timestamp(st + offsets.Hour(), + tz=target_tz).value - 1)) elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Minute(), - tz=target_tz).value - 1)) + Timestamp(Timestamp(st + offsets.Minute(), + tz=target_tz).value - 1)) elif reso == 'second': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second) return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Second(), - tz=target_tz).value - 1)) + Timestamp(Timestamp(st + offsets.Second(), + tz=target_tz).value - 1)) elif reso == 'microsecond': st = datetime(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, From fd4917531a00df11fcbbeb24e7ca8893700ebc80 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Sun, 5 Nov 2017 20:49:29 +0100 Subject: [PATCH 33/44] Rewrite naive/timezone matrix condition, Improve test cases --- pandas/core/indexes/datetimes.py | 82 ++++++++++--------- .../tests/indexes/datetimes/test_datetime.py | 8 +- pandas/tests/indexing/test_datetime.py | 78 +++++++++++++----- 3 files changed, 106 insertions(+), 62 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 4f9692003d18d..ece8c2e796d08 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1265,7 +1265,7 @@ def _parsed_string_to_bounds(self, reso, parsed): ---------- 
reso : Resolution Resolution provided by parsed string. - parsed : datetime | bool + parsed : datetime | object Datetime from parsed string. Returns @@ -1274,69 +1274,71 @@ def _parsed_string_to_bounds(self, reso, parsed): """ if not hasattr(parsed, "tzinfo"): - target_tz = None - elif parsed.tzinfo is None: - target_tz = self.tz + # see the following: + # - TestSlicing.test_partial_slicing_with_multiindex + # - test_partial_setting_with_datetimelike_dtype + pass + elif self.tz is None: + if parsed.tzinfo is None: # both are naive, nothing to do + pass + else: # naive datetime index but label provides timezone + warnings.warn("Access naive datetime index with a label " + "containing a timezone, assume UTC") + parsed = parsed.astimezone(_utc()) else: - target_tz = parsed.tzinfo - - def translate(timestamp_lower, timestamp_upper): - if self.tz is not None and parsed.tzinfo is not None: - return ( - timestamp_lower.tz_convert(self.tz), - timestamp_upper.tz_convert(self.tz) - ) - else: - return timestamp_lower, timestamp_upper + if parsed.tzinfo is None: # treat like in same timezone + parsed = parsed.replace(tzinfo=self.tz) + else: # actual timezone of the label should be considered + parsed = parsed.astimezone(tz=self.tz) if reso == 'year': - return translate(Timestamp(datetime(parsed.year, 1, 1), tz=target_tz), - Timestamp(datetime(parsed.year, 12, 31, 23, - 59, 59, 999999), tz=target_tz)) + return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), + Timestamp(datetime(parsed.year, 12, 31, 23, + 59, 59, 999999), tz=self.tz)) elif reso == 'month': d = libts.monthrange(parsed.year, parsed.month)[1] - return translate(Timestamp(datetime(parsed.year, parsed.month, 1), - tz=target_tz), - Timestamp(datetime(parsed.year, parsed.month, d, 23, - 59, 59, 999999), target_tz)) + return (Timestamp(datetime(parsed.year, parsed.month, 1), + tz=self.tz), + Timestamp(datetime(parsed.year, parsed.month, d, + 23, 59, 59, 999999), self.tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month - return translate(Timestamp(datetime(parsed.year, parsed.month, 1), - tz=target_tz), - Timestamp(datetime(parsed.year, qe, d, 23, 59, - 59, 999999), tz=target_tz)) + return (Timestamp(datetime(parsed.year, parsed.month, 1), + tz=self.tz), + Timestamp(datetime(parsed.year, qe, d, 23, 59, + 59, 999999), tz=self.tz)) elif reso == 'day': st = datetime(parsed.year, parsed.month, parsed.day) - return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Day(), - tz=target_tz).value - 1)) + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Day(), + tz=self.tz).value - 1)) elif reso == 'hour': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) - return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Hour(), - tz=target_tz).value - 1)) + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Hour(), tz=self.tz + ).value - 1)) elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) - return translate(Timestamp(st, tz=target_tz), - Timestamp(Timestamp(st + offsets.Minute(), - tz=target_tz).value - 1)) + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Minute(), + tz=self.tz).value - 1)) elif reso == 'second': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second) - return translate(Timestamp(st, 
tz=target_tz), - Timestamp(Timestamp(st + offsets.Second(), - tz=target_tz).value - 1)) + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Second(), + tz=self.tz).value - 1)) elif reso == 'microsecond': st = datetime(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, parsed.microsecond) - return translate( - Timestamp(st, tz=target_tz), - Timestamp(st, tz=target_tz) + return ( + Timestamp(st, tz=self.tz), + Timestamp(st, tz=self.tz) ) else: raise KeyError diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index cc6eeb44c99c9..f9829b1323b7b 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,3 +1,5 @@ +import warnings + import pytest import numpy as np @@ -211,7 +213,11 @@ def test_stringified_slice_with_tz(self): start = datetime.datetime.now() idx = DatetimeIndex(start=start, freq="1d", periods=10) df = DataFrame(lrange(10), index=idx) - df["2013-01-14 23:44:34.437768-05:00":] # no exception here + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + df["2013-01-14 23:44:34.437768-05:00":] # no exception here + assert len(w) == 1 + assert issubclass(w[-1].category, UserWarning) def test_append_join_nondatetimeindex(self): rng = date_range('1/1/2000', periods=10) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 648b33e9ecc3f..296c6e33753cb 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas as pd from pandas import date_range, Index, DataFrame, Series, Timestamp @@ -123,39 +125,73 @@ def test_consistency_with_tz_aware_scalar(self): result = df[0].at[0] assert result == expected - def test_access_datetimeindex_with_timezoned_label(self): + def test_access_timezoned_datetimeindex_with_timezoned_label(self): - # 6785, timezone was ignored when simple string was provided as a label + # GH 6785 + # timezone was ignored when string was provided as a label - idx = pd.DataFrame(index=pd.date_range('2016-01-01T00:00', - '2016-03-31T23:59', freq='T')) + first_january = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T', tz="UTC") + df = pd.DataFrame(index=first_january, data=np.arange(len( + first_january))) - former_naive_endpoint_idx = idx[ - "2016-01-01T00:00-02:00" - : - "2016-01-01T02:03" + former_naive_endpoint_df = df[ + "2016-01-01T00:00-02:00":"2016-01-01T02:03" ] - former_non_naive_endpoint_idx = idx[ - pd.Timestamp("2016-01-01T00:00-02:00") - : + former_non_naive_endpoint_df = df[ + pd.Timestamp("2016-01-01T00:00-02:00"): pd.Timestamp("2016-01-01T02:03") ] - assert (len(former_naive_endpoint_idx) - == len(former_non_naive_endpoint_idx)) + assert (len(former_naive_endpoint_df.index) == 4) + + assert (former_naive_endpoint_df.iloc[0].name + == former_non_naive_endpoint_df.iloc[0].name) + + assert (former_naive_endpoint_df.iloc[1].name + == former_non_naive_endpoint_df.iloc[1].name) + + assert (former_naive_endpoint_df.iloc[2].name + == former_non_naive_endpoint_df.iloc[2].name) + + assert (former_naive_endpoint_df.iloc[3].name + == former_non_naive_endpoint_df.iloc[3].name) + + def test_access_naive_datetimeindex_with_timezoned_label(self): + + # GH 6785 + # timezone was ignored when string was provided as a label + # this test is for completeness + + first_january = pd.date_range('2016-01-01T00:00', 
'2016-01-01T23:59', + freq='T') + df = pd.DataFrame(index=first_january, data=np.arange(len( + first_january))) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + former_naive_endpoint_df = df["2016-01-01T00:00-02:00": + "2016-01-01T02:03"] + assert len(w) == 1 + assert issubclass(w[-1].category, UserWarning) + + former_non_naive_endpoint_df = df[pd.Timestamp( + "2016-01-01T00:00-02:00"):pd.Timestamp("2016-01-01T02:03")] + + assert (len(former_naive_endpoint_df.index) == 4) - assert (former_naive_endpoint_idx.iloc[0].name - == former_non_naive_endpoint_idx.iloc[0].name) + assert (former_naive_endpoint_df.iloc[0].name + == former_non_naive_endpoint_df.iloc[0].name) - assert (former_naive_endpoint_idx.iloc[1].name - == former_non_naive_endpoint_idx.iloc[1].name) + assert (former_naive_endpoint_df.iloc[1].name + == former_non_naive_endpoint_df.iloc[1].name) - assert (former_naive_endpoint_idx.iloc[2].name - == former_non_naive_endpoint_idx.iloc[2].name) + assert (former_naive_endpoint_df.iloc[2].name + == former_non_naive_endpoint_df.iloc[2].name) - assert (former_naive_endpoint_idx.iloc[3].name - == former_non_naive_endpoint_idx.iloc[3].name) + assert (former_naive_endpoint_df.iloc[3].name + == former_non_naive_endpoint_df.iloc[3].name) def test_indexing_with_datetimeindex_tz(self): From d944bfda2b0517e3db7b5ffa7b3d8fae644d4dc7 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Sun, 5 Nov 2017 20:52:47 +0100 Subject: [PATCH 34/44] adjust as it was before (un-done changes) --- pandas/core/indexes/datetimes.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ece8c2e796d08..a27100b44760f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1299,8 +1299,8 @@ def _parsed_string_to_bounds(self, reso, parsed): d = libts.monthrange(parsed.year, parsed.month)[1] return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), - Timestamp(datetime(parsed.year, parsed.month, d, - 23, 59, 59, 999999), self.tz)) + Timestamp(datetime(parsed.year, parsed.month, d, 23, + 59, 59, 999999), self.tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month @@ -1317,8 +1317,8 @@ def _parsed_string_to_bounds(self, reso, parsed): st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Hour(), tz=self.tz - ).value - 1)) + Timestamp(Timestamp(st + offsets.Hour(), + tz=self.tz).value - 1)) elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) @@ -1336,10 +1336,7 @@ def _parsed_string_to_bounds(self, reso, parsed): st = datetime(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second, parsed.microsecond) - return ( - Timestamp(st, tz=self.tz), - Timestamp(st, tz=self.tz) - ) + return Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz) else: raise KeyError From 1641bf27b7972532d1d92a3f24613a9aca4ee611 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Sun, 5 Nov 2017 20:56:35 +0100 Subject: [PATCH 35/44] Add tz keyword. 
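
Pass the timezone through the ``tz`` keyword rather than positionally. A
minimal sketch of the intent, assuming the era's signature is
``Timestamp(ts_input, freq=None, tz=None, ...)`` (an assumption, not checked
against this exact revision):

    from datetime import datetime
    import pandas as pd

    # keyword form binds the zone to ``tz`` and yields a timezone-aware bound
    upper = pd.Timestamp(datetime(2016, 1, 31, 23, 59, 59, 999999), tz="UTC")
    assert upper.tzinfo is not None

    # Passing the zone positionally, e.g. Timestamp(dt, "UTC"), would land in
    # the ``freq`` slot under the assumed signature, which is why the keyword
    # is spelled out here.
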
--- pandas/core/indexes/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index a27100b44760f..eeaa1a74a44d6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1300,7 +1300,7 @@ def _parsed_string_to_bounds(self, reso, parsed): return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), Timestamp(datetime(parsed.year, parsed.month, d, 23, - 59, 59, 999999), self.tz)) + 59, 59, 999999), tz=self.tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = libts.monthrange(parsed.year, qe)[1] # at end of month From 1a3ab3b9584f02a0af54d974994902fffca3329b Mon Sep 17 00:00:00 2001 From: Date: Mon, 13 Nov 2017 21:39:00 +0100 Subject: [PATCH 36/44] Apply suggestions of review --- pandas/core/indexes/datetimes.py | 2 +- .../tests/indexes/datetimes/test_datetime.py | 5 +- pandas/tests/indexing/test_datetime.py | 68 ------------------- pandas/tests/indexing/test_partial.py | 41 +++++++++++ 4 files changed, 43 insertions(+), 73 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index bae310c3dd668..8fb0f64b2cff0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1257,7 +1257,7 @@ def _parsed_string_to_bounds(self, reso, parsed): ---------- reso : Resolution Resolution provided by parsed string. - parsed : datetime | object + parsed : datetime or object Datetime from parsed string. Returns diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 5ea419124101f..432d832146818 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -213,11 +213,8 @@ def test_stringified_slice_with_tz(self): start = datetime.datetime.now() idx = DatetimeIndex(start=start, freq="1d", periods=10) df = DataFrame(lrange(10), index=idx) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") + with tm.assert_produces_warning(UserWarning): df["2013-01-14 23:44:34.437768-05:00":] # no exception here - assert len(w) == 1 - assert issubclass(w[-1].category, UserWarning) def test_append_join_nondatetimeindex(self): rng = date_range('1/1/2000', periods=10) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 296c6e33753cb..e2408dba15f3f 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -125,74 +125,6 @@ def test_consistency_with_tz_aware_scalar(self): result = df[0].at[0] assert result == expected - def test_access_timezoned_datetimeindex_with_timezoned_label(self): - - # GH 6785 - # timezone was ignored when string was provided as a label - - first_january = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', - freq='T', tz="UTC") - df = pd.DataFrame(index=first_january, data=np.arange(len( - first_january))) - - former_naive_endpoint_df = df[ - "2016-01-01T00:00-02:00":"2016-01-01T02:03" - ] - - former_non_naive_endpoint_df = df[ - pd.Timestamp("2016-01-01T00:00-02:00"): - pd.Timestamp("2016-01-01T02:03") - ] - - assert (len(former_naive_endpoint_df.index) == 4) - - assert (former_naive_endpoint_df.iloc[0].name - == former_non_naive_endpoint_df.iloc[0].name) - - assert (former_naive_endpoint_df.iloc[1].name - == former_non_naive_endpoint_df.iloc[1].name) - - assert (former_naive_endpoint_df.iloc[2].name - == 
former_non_naive_endpoint_df.iloc[2].name) - - assert (former_naive_endpoint_df.iloc[3].name - == former_non_naive_endpoint_df.iloc[3].name) - - def test_access_naive_datetimeindex_with_timezoned_label(self): - - # GH 6785 - # timezone was ignored when string was provided as a label - # this test is for completeness - - first_january = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', - freq='T') - df = pd.DataFrame(index=first_january, data=np.arange(len( - first_january))) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - former_naive_endpoint_df = df["2016-01-01T00:00-02:00": - "2016-01-01T02:03"] - assert len(w) == 1 - assert issubclass(w[-1].category, UserWarning) - - former_non_naive_endpoint_df = df[pd.Timestamp( - "2016-01-01T00:00-02:00"):pd.Timestamp("2016-01-01T02:03")] - - assert (len(former_naive_endpoint_df.index) == 4) - - assert (former_naive_endpoint_df.iloc[0].name - == former_non_naive_endpoint_df.iloc[0].name) - - assert (former_naive_endpoint_df.iloc[1].name - == former_non_naive_endpoint_df.iloc[1].name) - - assert (former_naive_endpoint_df.iloc[2].name - == former_non_naive_endpoint_df.iloc[2].name) - - assert (former_naive_endpoint_df.iloc[3].name - == former_non_naive_endpoint_df.iloc[3].name) - def test_indexing_with_datetimeindex_tz(self): # GH 12050 diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 0e4957da5478c..c8ff4d8325538 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -641,3 +641,44 @@ def test_partial_set_empty_frame_empty_consistencies(self): df.loc[0, 'x'] = 1 expected = DataFrame(dict(x=[1], y=[np.nan])) tm.assert_frame_equal(df, expected, check_dtype=False) + + def test_access_timezoned_datetimeindex_with_timezoned_label(self): + + # GH 6785 + # timezone was ignored when string was provided as a label + + first_january = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T', tz="UTC") + df = pd.DataFrame(index=first_january, data=np.arange(len( + first_january))) + + former_naive_endpoint_df = df[ + "2016-01-01T00:00-02:00":"2016-01-01T02:03" + ] + + former_non_naive_endpoint_df = df[ + pd.Timestamp("2016-01-01T00:00-02:00"): + pd.Timestamp("2016-01-01T02:03") + ] + + tm.assert_frame_equal(former_naive_endpoint_df, former_non_naive_endpoint_df) + + def test_access_naive_datetimeindex_with_timezoned_label(self): + + # GH 6785 + # timezone was ignored when string was provided as a label + # this test is for completeness + + first_january = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T') + df = pd.DataFrame(index=first_january, data=np.arange(len( + first_january))) + + with tm.assert_produces_warning(UserWarning): + former_naive_endpoint_df = df["2016-01-01T00:00-02:00": + "2016-01-01T02:03"] + + former_non_naive_endpoint_df = df[pd.Timestamp( + "2016-01-01T00:00-02:00"):pd.Timestamp("2016-01-01T02:03")] + + tm.assert_frame_equal(former_non_naive_endpoint_df, former_naive_endpoint_df) From 31ef655de0a081ee1d0b22d9a6b3eea13c1ce6f9 Mon Sep 17 00:00:00 2001 From: Date: Mon, 13 Nov 2017 23:01:03 +0100 Subject: [PATCH 37/44] refactor: replace _utc() with utc --- pandas/core/indexes/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8fb0f64b2cff0..a9800c453f1be 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1276,7 +1276,7 @@ def 
_parsed_string_to_bounds(self, reso, parsed): else: # naive datetime index but label provides timezone warnings.warn("Access naive datetime index with a label " "containing a timezone, assume UTC") - parsed = parsed.astimezone(_utc()) + parsed = parsed.astimezone(utc) else: if parsed.tzinfo is None: # treat like in same timezone parsed = parsed.replace(tzinfo=self.tz) From edfd8958ec23fcadb8c4f7db5b3a61f6b81d7c88 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Mon, 13 Nov 2017 23:06:23 +0100 Subject: [PATCH 38/44] fix flake8 issues --- pandas/tests/indexes/datetimes/test_datetime.py | 2 -- pandas/tests/indexing/test_datetime.py | 2 -- pandas/tests/indexing/test_partial.py | 6 ++++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 432d832146818..ec5eb6bbee918 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,5 +1,3 @@ -import warnings - import pytest import numpy as np diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index e2408dba15f3f..617757c888eb5 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pandas as pd from pandas import date_range, Index, DataFrame, Series, Timestamp diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index c8ff4d8325538..5be1749964f77 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -661,7 +661,8 @@ def test_access_timezoned_datetimeindex_with_timezoned_label(self): pd.Timestamp("2016-01-01T02:03") ] - tm.assert_frame_equal(former_naive_endpoint_df, former_non_naive_endpoint_df) + tm.assert_frame_equal(former_naive_endpoint_df, + former_non_naive_endpoint_df) def test_access_naive_datetimeindex_with_timezoned_label(self): @@ -681,4 +682,5 @@ def test_access_naive_datetimeindex_with_timezoned_label(self): former_non_naive_endpoint_df = df[pd.Timestamp( "2016-01-01T00:00-02:00"):pd.Timestamp("2016-01-01T02:03")] - tm.assert_frame_equal(former_non_naive_endpoint_df, former_naive_endpoint_df) + tm.assert_frame_equal(former_non_naive_endpoint_df, + former_naive_endpoint_df) From 9f0dc5ddec93f04920076965dfe7354657a63f7c Mon Sep 17 00:00:00 2001 From: Date: Tue, 14 Nov 2017 21:08:54 +0100 Subject: [PATCH 39/44] replace datetime.datetime with pd.Timestamp --- pandas/core/indexes/datetimes.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index a9800c453f1be..a868d1fd572a8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1265,23 +1265,19 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ - if not hasattr(parsed, "tzinfo"): - # see the following: - # - TestSlicing.test_partial_slicing_with_multiindex - # - test_partial_setting_with_datetimelike_dtype - pass - elif self.tz is None: - if parsed.tzinfo is None: # both are naive, nothing to do + parsed = Timestamp(parsed) + if self.tz is None: + if parsed.tz is None: # both are naive, nothing to do pass else: # naive datetime index but label provides timezone warnings.warn("Access naive datetime index with a label " "containing a timezone, assume UTC") - parsed = parsed.astimezone(utc) + 
parsed = parsed.tz_convert(utc) else: - if parsed.tzinfo is None: # treat like in same timezone - parsed = parsed.replace(tzinfo=self.tz) + if parsed.tz is None: # treat like in same timezone + parsed = parsed.tz_localize(self.tz) else: # actual timezone of the label should be considered - parsed = parsed.astimezone(tz=self.tz) + parsed = parsed.tz_convert(tz=self.tz) if reso == 'year': return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), From 5c11e028dbbc1c9bdec27efdbe6411e044deb3b9 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Thu, 16 Nov 2017 20:40:54 +0100 Subject: [PATCH 40/44] Add whatsnew and documentation. --- doc/source/timeseries.rst | 27 +++++++++++++++++++++++++++ doc/source/whatsnew/v0.21.1.txt | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 26e701d008b3f..843a75aafb622 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -557,6 +557,33 @@ We are stopping on the included end-point as it is part of the index dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] + +``DatetimeIndex`` partial string indexing also works with timezones. +If a timezone is provided by the label, that timezone is respected. +If no timezone is provided, then the same timezone as used in the ``DatetimeIndex`` is assumed. + +.. ipython:: python + + first_january_cet = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T', tz="CET") + df = pd.DataFrame(index=first_january_utc, data=np.arange(len(first_january_utc))) + + four_minute_slice = df["2016-01-01T00:00-01:00":"2016-01-01T02:03"] + + +``DatetimeIndex`` partial string indexing can be used with naive labels as well. +If a timezone is provided by the label, the datetime index is assumed to be UTC and a ``UserWarning`` is emitted. + +.. ipython:: python + + first_january_implicit_utc = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T') + df = pd.DataFrame(index=first_january_implicit_utc, data=np.arange(len(first_january_implicit_utc))) + + four_minute_slice = df["2016-01-01T00:00-02:00":"2016-01-01T02:03"] + + + .. _timeseries.slice_vs_exact_match: Slice vs. Exact Match diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 0ab536f2898c7..0b9c71f347153 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- +- :class:`DatetimeIndex` is now timezone aware when using strings as labels (:issue:`16785`) - .. 
_whatsnew_0211.deprecations: From 6a218e5f1f0bf14b7a233ff7c83c2dbc3723599d Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Thu, 16 Nov 2017 22:25:42 +0100 Subject: [PATCH 41/44] Fix variable name in documentation --- doc/source/timeseries.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 843a75aafb622..e31ebf9404b97 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -566,10 +566,14 @@ If no timezone is provided, then the same timezone as used in the ``DatetimeInde first_january_cet = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', freq='T', tz="CET") - df = pd.DataFrame(index=first_january_utc, data=np.arange(len(first_january_utc))) + + df = pd.DataFrame(index=first_january_cet, + data=np.arange(len(first_january_cet))) four_minute_slice = df["2016-01-01T00:00-01:00":"2016-01-01T02:03"] + four_minute_slice + ``DatetimeIndex`` partial string indexing can be used with naive labels as well. If a timezone is provided by the label, the datetime index is assumed to be UTC and a ``UserWarning`` is emitted. @@ -578,10 +582,13 @@ If a timezone is provided by the label, the datetime index is assumed to be UTC first_january_implicit_utc = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', freq='T') - df = pd.DataFrame(index=first_january_implicit_utc, data=np.arange(len(first_january_implicit_utc))) + + df = pd.DataFrame(index=first_january_implicit_utc, + data=np.arange(len(first_january_implicit_utc))) four_minute_slice = df["2016-01-01T00:00-02:00":"2016-01-01T02:03"] + four_minute_slice .. _timeseries.slice_vs_exact_match: From 577d7426f848b4f328b8d9df5ccca18d6b167ad9 Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Fri, 17 Nov 2017 08:43:46 +0100 Subject: [PATCH 42/44] Apply review suggestions. --- doc/source/timeseries.rst | 40 ++++++++++++++++++++------------- doc/source/whatsnew/v0.21.1.txt | 2 +- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index e31ebf9404b97..67835b859255a 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -557,36 +557,46 @@ We are stopping on the included end-point as it is part of the index dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] +.. versionadded:: 0.21.1 -``DatetimeIndex`` partial string indexing also works with timezones. -If a timezone is provided by the label, that timezone is respected. -If no timezone is provided, then the same timezone as used in the ``DatetimeIndex`` is assumed. +``DatetimeIndex`` partial string indexing can be used with naive datetime-like labels when the ``DatetimeIndex`` has no timezone set. +If a timezone is provided by the label, the datetime index is assumed to be UTC and a ``UserWarning`` is emitted. + +.. note:: + + This both works with ``pd.Timestamp`` and strings .. 
ipython:: python + :okwarning: - first_january_cet = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', - freq='T', tz="CET") + first_january_implicit_utc = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T') - df = pd.DataFrame(index=first_january_cet, - data=np.arange(len(first_january_cet))) + df = pd.DataFrame(index=first_january_implicit_utc, + data=np.arange(len(first_january_implicit_utc))) - four_minute_slice = df["2016-01-01T00:00-01:00":"2016-01-01T02:03"] + df + + four_minute_slice = df["2016-01-01T00:00-02:00":"2016-01-01T02:03"] four_minute_slice -``DatetimeIndex`` partial string indexing can be used with naive labels as well. -If a timezone is provided by the label, the datetime index is assumed to be UTC and a ``UserWarning`` is emitted. +``DatetimeIndex`` partial string indexing is always well-defined on a ``DatetimeIndex`` with timezone information. +If a timezone is provided by the label, that timezone is respected. +If no timezone is provided, then the same timezone as used in the ``DatetimeIndex`` is assumed. .. ipython:: python - first_january_implicit_utc = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', - freq='T') + first_january_cet = pd.date_range('2016-01-01T00:00', '2016-01-01T23:59', + freq='T', tz="CET") - df = pd.DataFrame(index=first_january_implicit_utc, - data=np.arange(len(first_january_implicit_utc))) + df = pd.DataFrame(index=first_january_cet, + data=np.arange(len(first_january_cet))) - four_minute_slice = df["2016-01-01T00:00-02:00":"2016-01-01T02:03"] + df + + four_minute_slice = df["2016-01-01T00:00-01:00":"2016-01-01T02:03"] four_minute_slice diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 0b9c71f347153..5b52768832050 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- :class:`DatetimeIndex` is now timezone aware when using strings as labels (:issue:`16785`) +- :class:`DatetimeIndex` is now timezone aware when using strings as labels for indexing (:issue:`16785`) - .. _whatsnew_0211.deprecations: From 0e4c499e3489df9d3a2627a4f82b089f050f516f Mon Sep 17 00:00:00 2001 From: Marvin Kastner <1kastner@users.noreply.github.com> Date: Thu, 23 Nov 2017 21:18:47 +0100 Subject: [PATCH 43/44] Move change to bug and rename into result and expected --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/tests/indexing/test_partial.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 5b52768832050..f26059c188814 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,6 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- :class:`DatetimeIndex` is now timezone aware when using strings as labels for indexing (:issue:`16785`) - .. 
_whatsnew_0211.deprecations: @@ -62,6 +61,7 @@ Bug Fixes - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) +- Bug in :class:`DatetimeIndex` when partial string label indices are actually timezone aware (:issue:`16785`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 0119f40c328a4..262ef5e879cea 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -648,17 +648,16 @@ def test_access_timezoned_datetimeindex_with_timezoned_label(self): df = pd.DataFrame(index=first_january, data=np.arange(len( first_january))) - former_naive_endpoint_df = df[ + result = df[ "2016-01-01T00:00-02:00":"2016-01-01T02:03" ] - former_non_naive_endpoint_df = df[ + expected = df[ pd.Timestamp("2016-01-01T00:00-02:00"): pd.Timestamp("2016-01-01T02:03") ] - tm.assert_frame_equal(former_naive_endpoint_df, - former_non_naive_endpoint_df) + tm.assert_frame_equal(result, expected) def test_access_naive_datetimeindex_with_timezoned_label(self): @@ -672,11 +671,11 @@ def test_access_naive_datetimeindex_with_timezoned_label(self): first_january))) with tm.assert_produces_warning(UserWarning): - former_naive_endpoint_df = df["2016-01-01T00:00-02:00": - "2016-01-01T02:03"] + result = df["2016-01-01T00:00-02:00":"2016-01-01T02:03"] - former_non_naive_endpoint_df = df[pd.Timestamp( - "2016-01-01T00:00-02:00"):pd.Timestamp("2016-01-01T02:03")] + expected = df[ + pd.Timestamp("2016-01-01T00:00-02:00"): + pd.Timestamp("2016-01-01T02:03") + ] - tm.assert_frame_equal(former_non_naive_endpoint_df, - former_naive_endpoint_df) + tm.assert_frame_equal(expected, result) From 8a2176ddb52a8e0ada623d06329f77ad468c7c9b Mon Sep 17 00:00:00 2001 From: Date: Sun, 26 Nov 2017 23:11:03 +0100 Subject: [PATCH 44/44] Adjust for flake8 --- pandas/tests/indexing/test_partial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index a979de72714d5..2ed1be276b34f 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -659,7 +659,7 @@ def test_access_timezoned_datetimeindex_with_timezoned_label_utc(self): tm.assert_frame_equal(result, expected) - def test_access_timezoned_datetimeindex_with_timezoned_label_in_other_timezone(self): + def test_access_timezoned_datetimeindex_with_timezoned_label_in_cet(self): # GH 6785 # timezone was ignored when string was provided as a label