From b208a9e37ce036db71aa342b7cffd916e5672ff8 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 12 Jul 2016 01:39:31 +0900 Subject: [PATCH] API: Index doesn't result in PeriodIndex if Period contains NaT --- doc/source/whatsnew/v0.19.0.txt | 3 +- pandas/core/ops.py | 11 ++- pandas/indexes/base.py | 31 +++++---- pandas/src/inference.pyx | 34 +++++++-- pandas/tests/indexes/test_datetimelike.py | 84 +++++++++++++++++------ pandas/tests/types/test_inference.py | 27 ++++++++ pandas/tseries/base.py | 7 +- pandas/tseries/tests/test_base.py | 6 +- 8 files changed, 153 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 0b9695125c0a9..6755b54b195ef 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -267,6 +267,8 @@ API changes - ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) +- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeriodIndex.fillna`` with a ``Period`` that has a different freq now coerces to ``object`` dtype (:issue:`13664`) ..
_whatsnew_0190.api.tolist: @@ -601,7 +603,6 @@ Bug Fixes - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) - - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index d76f011df3dd8..44e3be32c23df 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -31,7 +31,7 @@ is_list_like, _ensure_object) from pandas.types.cast import _maybe_upcast_putmask -from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -773,6 +773,15 @@ def wrapper(self, other, axis=None): if (not lib.isscalar(lib.item_from_zerodim(other)) and len(self) != len(other)): raise ValueError('Lengths must match to compare') + + if isinstance(other, ABCPeriodIndex): + # temp workaround until fixing GH 13637 + # tested in test_nat_comparisons + # (pandas.tests.series.test_operators.TestSeriesOperators) + return self._constructor(na_op(self.values, + other.asobject.values), + index=self.index) + return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) elif isinstance(other, pd.Categorical): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index b013d6ccb0b8e..3b0e8327e5509 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -224,7 +224,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass # maybe coerce to a sub-class 
- from pandas.tseries.period import PeriodIndex + from pandas.tseries.period import (PeriodIndex, + IncompatibleFrequency) if isinstance(data, PeriodIndex): return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): @@ -265,13 +266,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - elif (inferred.startswith('timedelta') or - lib.is_timedelta_array(subarr)): + elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': - return PeriodIndex(subarr, name=name, **kwargs) + try: + return PeriodIndex(subarr, name=name, **kwargs) + except IncompatibleFrequency: + pass return cls._simple_new(subarr, name) elif hasattr(data, '__array__'): @@ -866,6 +869,16 @@ def _convert_can_do_setop(self, other): result_name = self.name if self.name == other.name else None return other, result_name + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return value + + def _assert_can_do_op(self, value): + """ Check value is valid for scalar op """ + if not lib.isscalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + @property def nlevels(self): return 1 @@ -1508,16 +1521,6 @@ def hasnans(self): else: return False - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value - - def _assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - def putmask(self, mask, value): """ return a new Index of the values set with the mask diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 9f96037c97c62..fe4748eb0eba0 100644 --- a/pandas/src/inference.pyx +++ 
b/pandas/src/inference.pyx @@ -270,7 +270,7 @@ cdef inline bint is_null_datetimelike(v): cdef inline bint is_null_datetime64(v): - # determine if we have a null for a datetime (or integer versions)x, + # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') if util._checknull(v): return True @@ -282,7 +282,7 @@ cdef inline bint is_null_datetime64(v): cdef inline bint is_null_timedelta64(v): - # determine if we have a null for a timedelta (or integer versions)x, + # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') if util._checknull(v): return True @@ -293,6 +293,16 @@ cdef inline bint is_null_timedelta64(v): return False +cdef inline bint is_null_period(v): + # determine if we have a null for a Period (or integer versions), + # excluding np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + return False + + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ -531,6 +541,7 @@ def is_timedelta_array(ndarray values): return False return null_count != n + def is_timedelta64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -546,6 +557,7 @@ def is_timedelta64_array(ndarray values): return False return null_count != n + def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ cdef Py_ssize_t i, null_count = 0, n = len(values) @@ -562,6 +574,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False return null_count != n + def is_date_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -571,6 +584,7 @@ def is_date_array(ndarray[object] values): return False return True + def is_time_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -582,15 +596,21 @@ def is_time_array(ndarray[object] values): def is_period_array(ndarray[object] values): - 
cdef Py_ssize_t i, n = len(values) - from pandas.tseries.period import Period - + cdef Py_ssize_t i, null_count = 0, n = len(values) + cdef object v if n == 0: return False + + # return False for all nulls for i in range(n): - if not isinstance(values[i], Period): + v = values[i] + if is_null_period(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_period(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 5c21f71d64660..af44767ae5be5 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -119,10 +119,10 @@ def test_pickle_compat_construction(self): def test_construction_index_with_mixed_timezones(self): # GH 11488 # no tz results in DatetimeIndex - result = Index( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) @@ -295,9 +295,9 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) @@ -338,6 +338,17 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + def 
test_construction_base_constructor(self): + arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -699,12 +710,11 @@ def test_fillna_datetime64(self): pd.Timestamp('2011-01-01 11:00')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - idx = pd.DatetimeIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], tz=tz) + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) - exp = pd.DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], tz=tz) + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) self.assert_index_equal( idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) @@ -734,6 +744,26 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), 
dtype=object)) + def test_astype(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -874,7 +904,6 @@ def test_repeat(self): self.assertEqual(res.freqstr, 'D') def test_period_index_indexer(self): - # GH4125 idx = pd.period_range('2002-01', '2003-12', freq='M') df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) @@ -886,12 +915,11 @@ def test_period_index_indexer(self): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], freq='H') + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], freq='H') - exp = pd.PeriodIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H') + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H') self.assert_index_equal( idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) @@ -899,10 +927,11 @@ def test_fillna_period(self): pd.Period('2011-01-01 11:00', freq='H')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - with tm.assertRaisesRegexp( - ValueError, - 'Input has different freq=D from PeriodIndex\\(freq=H\\)'): - idx.fillna(pd.Period('2011-01-01', freq='D')) + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), + pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), + exp) def test_no_millisecond_field(self): with self.assertRaises(AttributeError): @@ -923,6 +952,17 @@ def setUp(self): def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + def test_construction_base_constructor(self): + arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] + 
tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + def test_shift(self): # test shift for TimedeltaIndex # err8083 diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index 34d10ee9dfa42..9a12220f5b41d 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -431,6 +431,33 @@ def test_infer_dtype_timedelta(self): dtype=object) self.assertEqual(lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='M')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Period('2011-01', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([n, pd.Period('2011-01', freq='D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # different type of nat + arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) self.assertEqual(lib.infer_dtype(arr), 'floating') diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index fe0440170383b..188f538372092 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -800,12 +800,15 @@ def _ensure_datetimelike_to_i8(other): if lib.isscalar(other) and isnull(other): other = tslib.iNaT elif isinstance(other, ABCIndexClass): - # convert tz if needed if getattr(other, 'tz', None) is not None: other = 
other.tz_localize(None).asi8 else: other = other.asi8 else: - other = np.array(other, copy=False).view('i8') + try: + other = np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerced to int + other = Index(other).asi8 return other diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 958a10c329a46..2243f12a6cd06 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -1731,9 +1731,9 @@ def test_representation_to_series(self): 2 2013 dtype: object""" - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT dtype: object""" exp7 = """0 2013Q1