From 0e40ed42c7439d4b8e0c6680f5aaa5c30a64c6d0 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 08:14:30 -0700 Subject: [PATCH 1/6] Simplify Period/Datetime Array/Index constructors --- pandas/core/arrays/datetimelike.py | 32 +++++++++++++++++++ pandas/core/arrays/datetimes.py | 8 ++++- pandas/core/arrays/period.py | 15 ++++++--- pandas/core/arrays/timedeltas.py | 15 ++++----- pandas/core/indexes/datetimes.py | 4 --- pandas/core/indexes/period.py | 28 ++-------------- .../tests/indexes/period/test_construction.py | 8 ++--- 7 files changed, 63 insertions(+), 47 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e4ace2bfe1509..5aa9d7aa1be75 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -18,6 +18,7 @@ from pandas.tseries.offsets import Tick, DateOffset from pandas.core.dtypes.common import ( + pandas_dtype, needs_i8_conversion, is_list_like, is_offsetlike, @@ -911,3 +912,34 @@ def validate_tz_from_dtype(dtype, tz): except TypeError: pass return tz + + +def validate_dtype_freq(dtype, freq): + """ + If both a dtype and a freq are available, ensure they match. If only + dtype is available, extract the implied freq. + + Parameters + ---------- + dtype : dtype + freq : DateOffset or None + + Returns + ------- + freq : DateOffset + + Raises + ------ + ValueError : non-period dtype + IncompatibleFrequency : mismatch between dtype and freq + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError('dtype must be PeriodDtype') + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + raise IncompatibleFrequency('specified freq and dtype ' + 'are different') + return freq diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7daaa8de1734f..4c75927135b22 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -222,6 +222,12 @@ def __new__(cls, values, freq=None, tz=None, dtype=None): @classmethod def _generate_range(cls, start, end, periods, freq, tz=None, normalize=False, ambiguous='raise', closed=None): + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError('Must provide freq argument if no data is ' + 'supplied') + if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') @@ -264,7 +270,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if freq is not None: if cls._use_cached_range(freq, _normalized, start, end): # Currently always False; never hit - # Should be reimplemented as apart of GH 17914 + # Should be reimplemented as a part of GH#17914 index = cls._cached_range(start, end, periods=periods, freq=freq) else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6d13fb9ecaa39..d32ff76c0819b 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -27,6 +27,7 @@ from pandas.tseries import frequencies from pandas.tseries.offsets import Tick, DateOffset +from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin @@ -132,7 +133,7 @@ def __new__(cls, values, freq=None, **kwargs): # TODO: what if it has tz? values = dt64arr_to_periodarr(values, freq) - return cls._simple_new(values, freq, **kwargs) + return cls._simple_new(values, freq=freq, **kwargs) @classmethod def _simple_new(cls, values, freq=None, **kwargs): @@ -141,21 +142,27 @@ def _simple_new(cls, values, freq=None, **kwargs): Ordinals in an ndarray are fastpath-ed to `_from_ordinals` """ + if is_period_dtype(values): + freq = dtl.validate_dtype_freq(values.dtype, freq) + values = values.asi8 + if not is_integer_dtype(values): values = np.array(values, copy=False) if len(values) > 0 and is_float_dtype(values): raise TypeError("{cls} can't take floats" .format(cls=cls.__name__)) - return cls(values, freq=freq) + return cls(values, freq=freq, **kwargs) - return cls._from_ordinals(values, freq) + return cls._from_ordinals(values, freq=freq, **kwargs) @classmethod - def _from_ordinals(cls, values, freq=None): + def _from_ordinals(cls, values, freq=None, **kwargs): """ Values should be int ordinals `__new__` & `_simple_new` cooerce to ordinals and call this method """ + # **kwargs are included so that the signature matches PeriodIndex, + # letting us share _simple_new values = np.array(values, dtype='int64', copy=False) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index df9e57cb5f0e1..4904a90ab7b2b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -131,15 +131,6 @@ def __new__(cls, values, freq=None, start=None, end=None, periods=None, freq, freq_infer = dtl.maybe_infer_freq(freq) - if values is None: - # TODO: Remove this block and associated kwargs; GH#20535 - if freq is None and com._any_none(periods, start, end): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') - periods = dtl.validate_periods(periods) - return cls._generate_range(start, end, periods, freq, - closed=closed) - result = cls._simple_new(values, freq=freq) if freq_infer: inferred = result.inferred_freq @@ -151,6 +142,12 @@ def __new__(cls, values, freq=None, start=None, end=None, periods=None, @classmethod def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs): # **kwargs are for compat with TimedeltaIndex, which includes `name` + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError('Must provide freq argument if no data is ' + 'supplied') + if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e40ceadc1a083..f5b426bee74c8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -294,10 +294,6 @@ def __new__(cls, data=None, if data is None: # TODO: Remove this block and associated kwargs; GH#20535 - if freq is None and com._any_none(periods, start, end): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') - periods = dtl.validate_periods(periods) return cls._generate_range(start, end, periods, name, freq, tz=tz, normalize=normalize, closed=closed, ambiguous=ambiguous) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 7833dd851db34..2a0e530b76191 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -181,15 +181,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if name is None and hasattr(data, 'name'): name = data.name - if dtype is not None: - dtype = pandas_dtype(dtype) - if not is_period_dtype(dtype): - raise ValueError('dtype must be PeriodDtype') - if freq is None: - freq = dtype.freq - elif freq != dtype.freq: - msg = 'specified freq and dtype are different' - raise IncompatibleFrequency(msg) + freq = dtl.validate_dtype_freq(dtype, freq) # coerce freq to freq object, otherwise it can be coerced elementwise # which is slow @@ -218,7 +210,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, # not array / index if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): - if is_scalar(data) or isinstance(data, Period): + if is_scalar(data): cls._scalar_data_error(data) # other iterable of some kind @@ -248,21 +240,7 @@ def _engine(self): return self._engine_type(lambda: self, len(self)) @classmethod - def _simple_new(cls, values, name=None, freq=None, **kwargs): - """ - Values can be any type that can be coerced to Periods. - Ordinals in an ndarray are fastpath-ed to `_from_ordinals` - """ - if not is_integer_dtype(values): - values = np.array(values, copy=False) - if len(values) > 0 and is_float_dtype(values): - raise TypeError("PeriodIndex can't take floats") - return cls(values, name=name, freq=freq, **kwargs) - - return cls._from_ordinals(values, name, freq, **kwargs) - - @classmethod - def _from_ordinals(cls, values, name=None, freq=None, **kwargs): + def _from_ordinals(cls, values, freq=None, name=None): """ Values should be int ordinals `__new__` & `_simple_new` cooerce to ordinals and call this method diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index be741592ec7a2..d54dac5867845 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -264,20 +264,20 @@ def test_constructor_mixed(self): def test_constructor_simple_new(self): idx = period_range('2007-01', name='p', periods=2, freq='M') - result = idx._simple_new(idx, 'p', freq=idx.freq) + result = idx._simple_new(idx, name='p', freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) + result = idx._simple_new(idx.astype('i8'), name='p', freq=idx.freq) tm.assert_index_equal(result, idx) result = idx._simple_new([pd.Period('2007-01', freq='M'), pd.Period('2007-02', freq='M')], - 'p', freq=idx.freq) + name='p', freq=idx.freq) tm.assert_index_equal(result, idx) result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'), pd.Period('2007-02', freq='M')]), - 'p', freq=idx.freq) + name='p', freq=idx.freq) tm.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): From dc6307264f30ae6f6a9e6d8abaa3db32d314a01c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 08:40:38 -0700 Subject: [PATCH 2/6] revert removal of kwargs --- pandas/core/indexes/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 2a0e530b76191..ce3efb393d40a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -240,7 +240,7 @@ def _engine(self): return self._engine_type(lambda: self, len(self)) @classmethod - def _from_ordinals(cls, values, freq=None, name=None): + def _from_ordinals(cls, values, freq=None, name=None, **kwargs): """ Values should be int ordinals `__new__` & `_simple_new` cooerce to ordinals and call this method From ee210e46b5adb891538764279a5fee35a7f3ac7a Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 10:26:18 -0700 Subject: [PATCH 3/6] fix _simple_new calls in pytables --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ff37036533b4f..956b72d30893d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2476,7 +2476,7 @@ def _get_index_factory(self, klass): if klass == DatetimeIndex: def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - result = DatetimeIndex._simple_new(values.values, None, + result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) @@ -2484,7 +2484,7 @@ def f(values, freq=None, tz=None): return f elif klass == PeriodIndex: def f(values, freq=None, tz=None): - return PeriodIndex._simple_new(values, None, freq=freq) + return PeriodIndex._simple_new(values, name=None, freq=freq) return f return klass From fc88e59d2b7d1ce32d2c3ceba046bf9aab7878c6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 11:39:53 -0700 Subject: [PATCH 4/6] Fixup unused import --- pandas/core/indexes/period.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ce3efb393d40a..144e4dcaaded3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -8,7 +8,6 @@ is_integer, is_float, is_integer_dtype, - is_float_dtype, is_scalar, is_datetime64_dtype, is_datetime64_any_dtype, From 69a8f7ec6fc0b12fc63739a3aaf95d85911c080d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 11:41:45 -0700 Subject: [PATCH 5/6] override simple_new instead of _from_ordinasl --- pandas/core/indexes/period.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 144e4dcaaded3..70eb2eb33dbc4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -239,12 +239,8 @@ def _engine(self): return self._engine_type(lambda: self, len(self)) @classmethod - def _from_ordinals(cls, values, freq=None, name=None, **kwargs): - """ - Values should be int ordinals - `__new__` & `_simple_new` cooerce to ordinals and call this method - """ - result = super(PeriodIndex, cls)._from_ordinals(values, freq) + def _simple_new(cls, values, freq=None, name=None, **kwargs): + result = super(PeriodIndex, cls)._simple_new(values, freq) result.name = name result._reset_identity() From 04c75ca93ebaf693f9a60c7ff609806300083f88 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 15:13:38 -0700 Subject: [PATCH 6/6] Call simple_new instead of from_ordinals --- pandas/core/indexes/period.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 70eb2eb33dbc4..ebecf62ecf754 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -193,7 +193,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, else: data, freq = cls._generate_range(start, end, periods, freq, fields) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._simple_new(data, name=name, freq=freq) if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change @@ -221,7 +221,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, # datetime other than period if is_datetime64_dtype(data.dtype): data = dt64arr_to_periodarr(data, freq, tz) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._simple_new(data, name=name, freq=freq) # check not floats if infer_dtype(data) == 'floating' and len(data) > 0: @@ -232,7 +232,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, data = ensure_object(data) freq = freq or period.extract_freq(data) data = period.extract_ordinals(data, freq) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._simple_new(data, name=name, freq=freq) @cache_readonly def _engine(self): @@ -676,8 +676,8 @@ def _wrap_union_result(self, other, result): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._from_ordinals(rawarr, freq=self.freq, - name=self.name) + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, + name=self.name) return rawarr def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):