diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index dca4f890e496b..1f3ddb8923bac 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -585,6 +585,7 @@ Bug Fixes - Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) +- Cleaned up ``PeriodIndex`` constructor, including raising on floats more consistently (:issue:`13277`) - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 55d404f05dd1d..d37c98c9b9b90 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -471,8 +471,8 @@ def _value_counts_arraylike(values, dropna=True): # dtype handling if is_datetimetz_type: keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) - if is_period_type: - keys = PeriodIndex._simple_new(keys, freq=freq) + elif is_period_type: + keys = PeriodIndex._from_ordinals(keys, freq=freq) elif is_signed_integer_dtype(dtype): values = _ensure_int64(values) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 5d43d2d32af67..e441d9a88690d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -88,6 +88,11 @@ def _new_Index(cls, d): """ This is called upon unpickling, rather than the default which doesn't have arguments and breaks __new__ """ + # required for backward compat, because PI can't be instantiated with + # ordinals through __new__ GH #13277 + if issubclass(cls, ABCPeriodIndex): + from pandas.tseries.period import _new_PeriodIndex + return _new_PeriodIndex(cls, **d) return cls.__new__(cls, **d) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 7afe8a06b6af1..39bc1a4ecf225 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -573,7 +573,7 @@ def decode(obj): elif typ == u'period_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq']) - return globals()[obj[u'klass']](data, **d) + return globals()[obj[u'klass']]._from_ordinals(data, **d) elif typ == u'datetime_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 228615829b5b8..f13a84f4f0e92 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -120,7 +120,7 @@ def test_constructor_fromarraylike(self): self.assertRaises(ValueError, PeriodIndex, idx._values) self.assertRaises(ValueError, PeriodIndex, list(idx._values)) - self.assertRaises(ValueError, PeriodIndex, + self.assertRaises(TypeError, PeriodIndex, data=Period('2007', freq='A')) result = PeriodIndex(iter(idx)) @@ -285,12 +285,15 @@ def test_constructor_simple_new_empty(self): result = idx._simple_new(idx, name='p', freq='M') tm.assert_index_equal(result, idx) - def test_constructor_simple_new_floats(self): + def test_constructor_floats(self): # GH13079 - for floats in [[1.1], np.array([1.1])]: + for floats in [[1.1, 2.1], np.array([1.1, 2.1])]: with self.assertRaises(TypeError): pd.PeriodIndex._simple_new(floats, freq='M') + with self.assertRaises(TypeError): + pd.PeriodIndex(floats, freq='M') + def test_constructor_nat(self): self.assertRaises(ValueError, period_range, start='NaT', end='2011-01-01', freq='M') diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8a6b0c153bb50..ca61394fc0423 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -17,7 +17,6 @@ is_period_dtype, is_bool_dtype, pandas_dtype, - _ensure_int64, _ensure_object) from pandas.types.dtypes import PeriodDtype from pandas.types.generic import ABCSeries @@ -114,6 +113,13 @@ def wrapper(self, other): return wrapper +def _new_PeriodIndex(cls, **d): + # GH13277 for unpickling + if d['data'].dtype == 'int64': + values = d.pop('data') + return cls._from_ordinals(values=values, **d) + + class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in @@ -209,17 +215,56 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, msg = 'specified freq and dtype are different' raise IncompatibleFrequency(msg) + # coerce freq to freq object, otherwise it can be coerced elementwise + # which is slow + if freq: + freq = Period._maybe_convert_freq(freq) + if data is None: if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) else: data, freq = cls._generate_range(start, end, periods, freq, kwargs) - else: - ordinal, freq = cls._from_arraylike(data, freq, tz) - data = np.array(ordinal, dtype=np.int64, copy=copy) + return cls._from_ordinals(data, name=name, freq=freq) - return cls._simple_new(data, name=name, freq=freq) + if isinstance(data, PeriodIndex): + if freq is None or freq == data.freq: # no freq change + freq = data.freq + data = data._values + else: + base1, _ = _gfc(data.freq) + base2, _ = _gfc(freq) + data = period.period_asfreq_arr(data._values, + base1, base2, 1) + return cls._simple_new(data, name=name, freq=freq) + + # not array / index + if not isinstance(data, (np.ndarray, PeriodIndex, + DatetimeIndex, Int64Index)): + if is_scalar(data) or isinstance(data, Period): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + data = np.asarray(data) + + # datetime other than period + if is_datetime64_dtype(data.dtype): + data = dt64arr_to_periodarr(data, freq, tz) + return cls._from_ordinals(data, name=name, freq=freq) + + # check not floats + if infer_dtype(data) == 'floating' and len(data) > 0: + raise TypeError("PeriodIndex can't take floats") + + # anything else, likely an array of strings or periods + data = _ensure_object(data) + freq = freq or period.extract_freq(data) + data = period.extract_ordinals(data, freq) + return cls._from_ordinals(data, name=name, freq=freq) @classmethod def _generate_range(cls, start, end, periods, freq, fields): @@ -240,77 +285,26 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq - @classmethod - def _from_arraylike(cls, data, freq, tz): - if freq is not None: - freq = Period._maybe_convert_freq(freq) - - if not isinstance(data, (np.ndarray, PeriodIndex, - DatetimeIndex, Int64Index)): - if is_scalar(data) or isinstance(data, Period): - raise ValueError('PeriodIndex() must be called with a ' - 'collection of some kind, %s was passed' - % repr(data)) - - # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - - try: - data = _ensure_int64(data) - if freq is None: - raise ValueError('freq not specified') - data = np.array([Period(x, freq=freq) for x in data], - dtype=np.int64) - except (TypeError, ValueError): - data = _ensure_object(data) - - if freq is None: - freq = period.extract_freq(data) - data = period.extract_ordinals(data, freq) - else: - if isinstance(data, PeriodIndex): - if freq is None or freq == data.freq: - freq = data.freq - data = data._values - else: - base1, _ = _gfc(data.freq) - base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, - base1, base2, 1) - else: - if is_object_dtype(data): - inferred = infer_dtype(data) - if inferred == 'integer': - data = data.astype(np.int64) - - if freq is None and is_object_dtype(data): - # must contain Period instance and thus extract ordinals - freq = period.extract_freq(data) - data = period.extract_ordinals(data, freq) - - if freq is None: - msg = 'freq not specified and cannot be inferred' - raise ValueError(msg) - - if data.dtype != np.int64: - if np.issubdtype(data.dtype, np.datetime64): - data = dt64arr_to_periodarr(data, freq, tz) - else: - data = _ensure_object(data) - data = period.extract_ordinals(data, freq) - - return data, freq - @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - + """ + Values can be any type that can be coerced to Periods. + Ordinals in an ndarray are fastpath-ed to `_from_ordinals` + """ if not is_integer_dtype(values): values = np.array(values, copy=False) - if (len(values) > 0 and is_float_dtype(values)): + if len(values) > 0 and is_float_dtype(values): raise TypeError("PeriodIndex can't take floats") - else: - return cls(values, name=name, freq=freq, **kwargs) + return cls(values, name=name, freq=freq, **kwargs) + + return cls._from_ordinals(values, name, freq, **kwargs) + + @classmethod + def _from_ordinals(cls, values, name=None, freq=None, **kwargs): + """ + Values should be int ordinals + `__new__` & `_simple_new` cooerce to ordinals and call this method + """ values = np.array(values, dtype='int64', copy=False) @@ -318,7 +312,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._data = values result.name = name if freq is None: - raise ValueError('freq is not specified') + raise ValueError('freq is not specified and cannot be inferred') result.freq = Period._maybe_convert_freq(freq) result._reset_identity() return result @@ -327,13 +321,13 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): """ we always want to return a PeriodIndex """ return self._shallow_copy(values=values, **kwargs) - def _shallow_copy(self, values=None, **kwargs): - if kwargs.get('freq') is None: - # freq must be provided - kwargs['freq'] = self.freq + def _shallow_copy(self, values=None, freq=None, **kwargs): + if freq is None: + freq = self.freq if values is None: values = self._values - return super(PeriodIndex, self)._shallow_copy(values=values, **kwargs) + return super(PeriodIndex, self)._shallow_copy(values=values, + freq=freq, **kwargs) def _coerce_scalar_to_index(self, item): """ @@ -413,7 +407,7 @@ def __array_wrap__(self, result, context=None): return result # the result is object dtype array of Period # cannot pass _simple_new as it is - return PeriodIndex(result, freq=self.freq, name=self.name) + return self._shallow_copy(result, freq=self.freq, name=self.name) @property def _box_func(self): @@ -708,7 +702,7 @@ def shift(self, n): values = self._values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT - return PeriodIndex(data=values, name=self.name, freq=self.freq) + return self._shallow_copy(values=values) @cache_readonly def dtype(self): @@ -945,7 +939,8 @@ def _wrap_union_result(self, other, result): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex(rawarr, freq=self.freq) + rawarr = PeriodIndex._from_ordinals(rawarr, freq=self.freq, + name=self.name) return rawarr def _format_native_types(self, na_rep=u('NaT'), date_format=None, diff --git a/setup.cfg b/setup.cfg index b9de7a3532209..8de4fc955bd50 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,6 +13,7 @@ parentdir_prefix = pandas- [flake8] ignore = E731,E402 +max-line-length = 79 [yapf] based_on_style = pep8