diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 38629ee7baaea..3618c31b65116 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1367,6 +1367,31 @@ with duplicates dropped. idx1.sym_diff(idx2) idx1 ^ idx2 +Missing values +~~~~~~~~~~~~~~ + +.. _indexing.missing: + +.. versionadded:: 0.17.1 + +.. important:: + + Even though ``Index`` can hold missing values (``NaN``), it should be avoided + if you do not want any unexpected results. For example, some operations + exclude missing values implicitly. + +``Index.fillna`` fills missing values with specified scalar value. + +.. ipython:: python + + idx1 = pd.Index([1, np.nan, 3, 4]) + idx1 + idx1.fillna(2) + + idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) + idx2 + idx2.fillna(pd.Timestamp('2011-01-02')) + Set / Reset Index ----------------- diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 28129287d51af..5d4c670620070 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -26,6 +26,12 @@ Enhancements - ``DataFrame`` now uses the fields of a ``namedtuple`` as columns, if columns are not supplied (:issue:`11181`) - Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`) +- ``Index`` now has ``fillna`` method (:issue:`10089`) + +.. ipython:: python + + pd.Index([1, np.nan, 3]).fillna(2) + .. 
_whatsnew_0171.api: API changes diff --git a/pandas/core/index.py b/pandas/core/index.py index 855e3f013bfd3..1de0f78fa6bb1 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -15,7 +15,8 @@ from pandas.compat import range, zip, lrange, lzip, u, map from pandas import compat from pandas.core import algorithms -from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate +from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate +import pandas.core.base as base from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate, deprecate_kwarg) import pandas.core.common as com @@ -29,8 +30,6 @@ from pandas.io.common import PerformanceWarning - - # simplify default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'), @@ -45,6 +44,7 @@ _index_doc_kwargs = dict(klass='Index', inplace='', duplicated='np.array') +_index_shared_docs = dict() def _try_get_item(x): @@ -108,6 +108,7 @@ class Index(IndexOpsMixin, PandasObject): _allow_datetime_index_ops = False _allow_period_index_ops = False _is_numeric_dtype = False + _can_hold_na = True _engine_type = _index.ObjectEngine @@ -1236,6 +1237,43 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): taken = self.values.take(indices) return self._shallow_copy(taken) + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + if self._can_hold_na: + return isnull(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values + + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) + + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + if self._can_hold_na: + return self._isnan.any() + 
else: + return False + + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return value + + def _assert_can_do_op(self, value): + """ Check value is valid for scalar op """ + if not lib.isscalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + def putmask(self, mask, value): """ return a new Index of the values set with the mask @@ -1245,8 +1283,12 @@ def putmask(self, mask, value): numpy.ndarray.putmask """ values = self.values.copy() - np.putmask(values, mask, value) - return self._shallow_copy(values) + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError): + # coerces to object + return self.astype(object).putmask(mask, value) def format(self, name=False, formatter=None, **kwargs): """ @@ -2766,15 +2808,45 @@ def drop(self, labels, errors='raise'): return self.delete(indexer) @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs) + @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs) def drop_duplicates(self, keep='first'): return super(Index, self).drop_duplicates(keep=keep) @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): return super(Index, self).duplicated(keep=keep) + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. 
float64 to int64 if possible) + + Returns + ------- + filled : Index + """ + + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + + return self._shallow_copy() + def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only perform ops with timedelta like values") @@ -3200,6 +3272,16 @@ def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + return self._data.codes == -1 + + @Appender(_index_shared_docs['fillna']) + def fillna(self, value, downcast=None): + self._assert_can_do_op(value) + return CategoricalIndex(self._data.fillna(value), name=self.name) + def argsort(self, *args, **kwargs): return self.values.argsort(*args, **kwargs) @@ -3214,7 +3296,7 @@ def is_unique(self): return not self.duplicated().any() @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 return duplicated_int64(self.codes.astype('i8'), keep) @@ -3612,6 +3694,8 @@ class Int64Index(NumericIndex): _inner_indexer = _algos.inner_join_indexer_int64 _outer_indexer = _algos.outer_join_indexer_int64 + _can_hold_na = False + _engine_type = _index.Int64Engine def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): @@ -3646,11 +3730,6 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, * def inferred_type(self): return 'integer' - @cache_readonly - def hasnans(self): - # by 
definition - return False - @property def asi8(self): # do not cache or you'll create a memory leak @@ -3872,19 +3951,6 @@ def is_all_dates(self): """ return False - @cache_readonly - def _nan_idxs(self): - w, = self._isnan.nonzero() - return w - - @cache_readonly - def _isnan(self): - return np.isnan(self.values) - - @cache_readonly - def hasnans(self): - return self._isnan.any() - @cache_readonly def is_unique(self): return super(Float64Index, self).is_unique and self._nan_idxs.size < 2 @@ -4409,7 +4475,7 @@ def is_unique(self): return not self.duplicated().any() @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) - @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): from pandas.core.groupby import get_group_index from pandas.hashtable import duplicated_int64 @@ -4419,6 +4485,11 @@ def duplicated(self, keep='first'): return duplicated_int64(ids, keep) + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + # isnull is not implemented for MultiIndex + raise NotImplementedError('isnull is not defined for MultiIndex') + def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index cfc50afc8f9f3..43adbbb66b80e 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -452,7 +452,7 @@ def extract_ordinals(ndarray[object] values, freq): p = values[i] ordinals[i] = p.ordinal if p.freqstr != freqstr: - raise ValueError("%s is wrong freq" % p) + raise ValueError(_DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr)) return ordinals @@ -624,8 +624,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, return result -_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from Period(freq={0})" - +_DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})" 
+_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})" cdef class Period(object): """ @@ -766,7 +766,7 @@ cdef class Period(object): if isinstance(other, Period): from pandas.tseries.frequencies import get_freq_code as _gfc if other.freq != self.freq: - msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr) + msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise ValueError(msg) if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: return _nat_scalar_rules[op] @@ -807,7 +807,7 @@ cdef class Period(object): else: ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) - msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr) + msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise ValueError(msg) else: # pragma no cover return NotImplemented diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index fb255f300ebdd..3a42059a63b0d 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -182,6 +182,15 @@ def f(): class Ops(tm.TestCase): + + def _allow_na_ops(self, obj): + """Whether to skip test cases including NaN""" + if (isinstance(obj, Index) and + (obj.is_boolean() or not obj._can_hold_na)): + # don't test boolean / int64 index + return False + return True + def setUp(self): self.bool_index = tm.makeBoolIndex(10, name='a') self.int_index = tm.makeIntIndex(10, name='a') @@ -452,13 +461,7 @@ def test_value_counts_unique_nunique(self): klass = type(o) values = o.values - if isinstance(o,Index) and o.is_boolean(): - # don't test boolean - continue - - if ((isinstance(o, Int64Index) and not isinstance(o, - (DatetimeIndex, PeriodIndex)))): - # skips int64 because it doesn't allow to include nan or None + if not self._allow_na_ops(o): continue # special assign to the numpy array @@ -815,6 +818,64 @@ def test_duplicated_drop_duplicates(self): s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) + def test_fillna(self): + # # GH 11343 
+ # though Index.fillna and Series.fillna has separate impl, + # test here to confirm these works as the same + def get_fill_value(obj): + if isinstance(obj, pd.tseries.base.DatetimeIndexOpsMixin): + return obj.asobject.values[0] + else: + return obj.values[0] + + for o in self.objs: + klass = type(o) + values = o.values + + # values will not be changed + result = o.fillna(get_fill_value(o)) + if isinstance(o, Index): + self.assert_index_equal(o, result) + else: + self.assert_series_equal(o, result) + # check shallow_copied + self.assertFalse(o is result) + + for null_obj in [np.nan, None]: + for o in self.objs: + klass = type(o) + values = o.values.copy() + + if not self._allow_na_ops(o): + continue + + # value for filling + fill_value = get_fill_value(o) + + # special assign to the numpy array + if o.values.dtype == 'datetime64[ns]' or isinstance(o, PeriodIndex): + values[0:2] = pd.tslib.iNaT + else: + values[0:2] = null_obj + + if isinstance(o, PeriodIndex): + # freq must be specified because repeat makes freq ambiguous + expected = [fill_value.ordinal] * 2 + list(values[2:]) + expected = klass(ordinal=expected, freq=o.freq) + o = klass(ordinal=values, freq=o.freq) + else: + expected = [fill_value] * 2 + list(values[2:]) + expected = klass(expected) + o = klass(values) + + result = o.fillna(fill_value) + if isinstance(o, Index): + self.assert_index_equal(result, expected) + else: + self.assert_series_equal(result, expected) + # check shallow_copied + self.assertFalse(o is result) + class TestFloat64HashTable(tm.TestCase): def test_lookup_nan(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f7d93a978a46a..f897ac02b2a8a 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -559,6 +559,81 @@ def test_numpy_ufuncs(self): with tm.assertRaises(Exception): func(idx) + def test_hasnans_isnans(self): + # GH 11343, added tests for hasnans / isnans + for name, index in self.indices.items(): + if isinstance(index, 
MultiIndex): + pass + else: + idx = index.copy() + + # cases in indices doesn't include NaN + expected = np.array([False] * len(idx), dtype=bool) + self.assert_numpy_array_equal(idx._isnan, expected) + self.assertFalse(idx.hasnans) + + idx = index.copy() + values = idx.values + + if len(index) == 0: + continue + elif isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): + values[1] = pd.tslib.iNaT + elif isinstance(index, Int64Index): + continue + else: + values[1] = np.nan + + if isinstance(index, PeriodIndex): + idx = index.__class__(values, freq=index.freq) + else: + idx = index.__class__(values) + + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + self.assert_numpy_array_equal(idx._isnan, expected) + self.assertTrue(idx.hasnans) + + def test_fillna(self): + # GH 11343 + for name, index in self.indices.items(): + if len(index) == 0: + pass + elif isinstance(index, MultiIndex): + idx = index.copy() + msg = "isnull is not defined for MultiIndex" + with self.assertRaisesRegexp(NotImplementedError, msg): + idx.fillna(idx[0]) + else: + idx = index.copy() + result = idx.fillna(idx[0]) + self.assert_index_equal(result, idx) + self.assertFalse(result is idx) + + msg = "'value' must be a scalar, passed: " + with self.assertRaisesRegexp(TypeError, msg): + idx.fillna([idx[0]]) + + idx = index.copy() + values = idx.values + + if isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin): + values[1] = pd.tslib.iNaT + elif isinstance(index, Int64Index): + continue + else: + values[1] = np.nan + + if isinstance(index, PeriodIndex): + idx = index.__class__(values, freq=index.freq) + else: + idx = index.__class__(values) + + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + self.assert_numpy_array_equal(idx._isnan, expected) + self.assertTrue(idx.hasnans) + class TestIndex(Base, tm.TestCase): _holder = Index @@ -2516,6 +2591,17 @@ def test_string_categorical_index_repr(self): categories=[u'あ', u'い', u'う', u'え', u'お', u'か', 
u'き', u'く', ...], ordered=False, dtype='category')""" self.assertEqual(unicode(idx), expected) + def test_fillna_categorical(self): + # GH 11343 + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name='x') + # fill by value in categories + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name='x') + self.assert_index_equal(idx.fillna(1.0), exp) + + # fill by value not in categories raises ValueError + with tm.assertRaisesRegexp(ValueError, 'fill value must be in categories'): + idx.fillna(2.0) + class Numeric(Base): @@ -2798,6 +2884,21 @@ def test_astype_from_object(self): tm.assert_equal(result.dtype, expected.dtype) tm.assert_index_equal(result, expected) + def test_fillna_float64(self): + # GH 11343 + idx = Index([1.0, np.nan, 3.0], dtype=float, name='x') + # can't downcast + exp = Index([1.0, 0.1, 3.0], name='x') + self.assert_index_equal(idx.fillna(0.1), exp) + + # downcast + exp = Int64Index([1, 2, 3], name='x') + self.assert_index_equal(idx.fillna(2), exp) + + # object + exp = Index([1, 'obj', 3], name='x') + self.assert_index_equal(idx.fillna('obj'), exp) + class TestInt64Index(Numeric, tm.TestCase): _holder = Int64Index @@ -3551,6 +3652,39 @@ def test_ufunc_coercions(self): tm.assert_index_equal(result, exp) self.assertEqual(result.freq, 'D') + def test_fillna_datetime64(self): + # GH 11343 + for tz in ['US/Eastern', 'Asia/Tokyo']: + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00']) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00']) + self.assert_index_equal(idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # tz mismatch + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), pd.Timestamp('2011-01-01 10:00', tz=tz), + pd.Timestamp('2011-01-01 11:00')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', + pd.Timestamp('2011-01-01 11:00')], dtype=object) + 
self.assert_index_equal(idx.fillna('x'), exp) + + + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], tz=tz) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], tz=tz) + self.assert_index_equal(idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), pd.Timestamp('2011-01-01 10:00'), + pd.Timestamp('2011-01-01 11:00', tz=tz)], dtype=object) + self.assert_index_equal(idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), 'x', + pd.Timestamp('2011-01-01 11:00', tz=tz)], dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex @@ -3633,6 +3767,21 @@ def test_period_index_indexer(self): self.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) self.assert_frame_equal(df, df.loc[list(idx)]) + def test_fillna_period(self): + # GH 11343 + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], freq='H') + + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], freq='H') + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) + + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x', + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) + + with tm.assertRaisesRegexp(ValueError, 'Input has different freq=D from PeriodIndex\\(freq=H\\)'): + idx.fillna(pd.Period('2011-01-01', freq='D')) + + class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex _multiprocess_can_split_ = True @@ -3751,6 +3900,19 @@ def test_ufunc_coercions(self): tm.assert_index_equal(result, exp) self.assertEqual(result.freq, None) + def test_fillna_timedelta(self): + # GH 11343 + idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day']) + + exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day']) + 
self.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp) + + exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day']) + idx.fillna(pd.Timedelta('3 hour')) + + exp = pd.Index([pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) + self.assert_index_equal(idx.fillna('x'), exp) + class TestMultiIndex(Base, tm.TestCase): _holder = MultiIndex diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 50137493e6b01..d5382e8057f4b 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -201,10 +201,15 @@ def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + return (self.asi8 == tslib.iNaT) + @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ - return (self.asi8 == tslib.iNaT).any() + return self._isnan.any() @property def asobject(self): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 868057c675594..fd26e9834bd5f 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -499,6 +499,12 @@ def _generate(cls, start, end, periods, name, offset, def _box_func(self): return lambda x: Timestamp(x, offset=self.offset, tz=self.tz) + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + if self._has_same_tz(value): + return _to_m8(value) + raise ValueError('Passed item and index have different timezone') + def _local_timestamps(self): utc = _utc() @@ -548,6 +554,21 @@ def tzinfo(self): """ return self.tz + @cache_readonly + def _timezone(self): + """ Comparable timezone both for pytz / dateutil""" + return tslib.get_timezone(self.tzinfo) + + def _has_same_tz(self, other): + zzone = self._timezone + + # vzone sholdn't be None if value is non-datetime like + if isinstance(other, np.datetime64): + # convert to Timestamp as np.datetime64 doesn't have tz attr + other = Timestamp(other) + vzone = 
tslib.get_timezone(getattr(other, 'tzinfo', '__no_tz__')) + return zzone == vzone + @classmethod def _cached_range(cls, start=None, end=None, periods=None, offset=None, name=None): @@ -680,7 +701,7 @@ def _sub_datelike(self, other): other = Timestamp(other) # require tz compat - if tslib.get_timezone(self.tz) != tslib.get_timezone(other.tzinfo): + if not self._has_same_tz(other): raise TypeError("Timestamp subtraction must have the same timezones or no timezones") i8 = self.asi8 @@ -1552,17 +1573,9 @@ def equals(self, other): except: return False - if self.tz is not None: - if other.tz is None: - return False - same_zone = tslib.get_timezone( - self.tz) == tslib.get_timezone(other.tz) - else: - if other.tz is not None: - return False - same_zone = True - - return same_zone and np.array_equal(self.asi8, other.asi8) + if self._has_same_tz(other): + return np.array_equal(self.asi8, other.asi8) + return False def insert(self, loc, item): """ @@ -1581,10 +1594,10 @@ def insert(self, loc, item): """ freq = None + if isinstance(item, (datetime, np.datetime64)): - zone = tslib.get_timezone(self.tz) - izone = tslib.get_timezone(getattr(item, 'tzinfo', None)) - if zone != izone: + self._assert_can_do_op(item) + if not self._has_same_tz(item): raise ValueError('Passed item and index have different timezone') # check freq can be preserved on edge cases if self.size and self.freq is not None: diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 888c50e86b7b2..578727f515fe4 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -57,7 +57,7 @@ def dt64arr_to_periodarr(data, freq, tz): # --- Period index sketch -_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from PeriodIndex(freq={0})" +_DIFFERENT_FREQ_INDEX = period._DIFFERENT_FREQ_INDEX def _period_index_cmp(opname, nat_result=False): """ @@ -68,13 +68,13 @@ def wrapper(self, other): func = getattr(self.values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: - msg = 
_DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr) + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise ValueError(msg) result = func(other.ordinal) elif isinstance(other, PeriodIndex): if other.freq != self.freq: - msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr) + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise ValueError(msg) result = getattr(self.values, opname)(other.values) @@ -336,6 +336,10 @@ def __array_wrap__(self, result, context=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return self._box_func(value) + def _to_embed(self, keep_tz=False): """ return an array repr of this object, potentially casting to object """ return self.asobject.values @@ -378,7 +382,7 @@ def astype(self, dtype): def searchsorted(self, key, side='left'): if isinstance(key, Period): if key.freq != self.freq: - msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, key.freqstr) + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, key.freqstr) raise ValueError(msg) key = key.ordinal elif isinstance(key, compat.string_types): @@ -764,7 +768,7 @@ def _assert_can_do_setop(self, other): raise ValueError('can only call with other PeriodIndex-ed objects') if self.freq != other.freq: - msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr) + msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise ValueError(msg) def _wrap_union_result(self, other, result): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index afb15badf433c..0d47c2526df14 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -642,7 +642,7 @@ class NaTType(_NaT): def __reduce__(self): return (__nat_unpickle, (None, )) - + def total_seconds(self): # GH 10939 return np.nan @@ -1749,7 +1749,8 @@ def dateutil_parse(object timestr, object default, ignoretz=False, res, _ = res if res is None: - raise ValueError("unknown 
string format") + msg = "Unknown datetime string format, unable to parse: {0}" + raise ValueError(msg.format(timestr)) for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: @@ -1759,7 +1760,8 @@ def dateutil_parse(object timestr, object default, ignoretz=False, reso = attr if reso is None: - raise ValueError("Cannot parse date.") + msg = "Unable to parse datetime string: {0}" + raise ValueError(msg.format(timestr)) if reso == 'microsecond': if repl['microsecond'] == 0: