Skip to content

REF: Simplify Period/Datetime Array/Index constructors #23093

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 12, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas.tseries.offsets import Tick, DateOffset

from pandas.core.dtypes.common import (
pandas_dtype,
needs_i8_conversion,
is_list_like,
is_offsetlike,
Expand Down Expand Up @@ -911,3 +912,34 @@ def validate_tz_from_dtype(dtype, tz):
except TypeError:
pass
return tz


def validate_dtype_freq(dtype, freq):
"""
If both a dtype and a freq are available, ensure they match. If only
dtype is available, extract the implied freq.

Parameters
----------
dtype : dtype
freq : DateOffset or None

Returns
-------
freq : DateOffset

Raises
------
ValueError : non-period dtype
IncompatibleFrequency : mismatch between dtype and freq
"""
if dtype is not None:
dtype = pandas_dtype(dtype)
if not is_period_dtype(dtype):
raise ValueError('dtype must be PeriodDtype')
if freq is None:
freq = dtype.freq
elif freq != dtype.freq:
raise IncompatibleFrequency('specified freq and dtype '
'are different')
return freq
8 changes: 7 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,12 @@ def __new__(cls, values, freq=None, tz=None, dtype=None):
@classmethod
def _generate_range(cls, start, end, periods, freq, tz=None,
normalize=False, ambiguous='raise', closed=None):

periods = dtl.validate_periods(periods)
if freq is None and any(x is None for x in [periods, start, end]):
raise ValueError('Must provide freq argument if no data is '
'supplied')

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved these checks here from inside __new__

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, as long as we're OK with ignoring gibberish arguments when there not actually used (which seems to be what we do right now).

In [9]: idx = pd.PeriodIndex(['2000', '2001'], freq='D')

In [10]: pd.PeriodIndex(idx, start='foo')
Out[10]: PeriodIndex(['2000-01-01', '2001-01-01'], dtype='period[D]', freq='D')

if com.count_not_none(start, end, periods, freq) != 3:
raise ValueError('Of the four parameters: start, end, periods, '
'and freq, exactly three must be specified')
Expand Down Expand Up @@ -264,7 +270,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
if freq is not None:
if cls._use_cached_range(freq, _normalized, start, end):
# Currently always False; never hit
# Should be reimplemented as apart of GH 17914
# Should be reimplemented as a part of GH#17914
index = cls._cached_range(start, end, periods=periods,
freq=freq)
else:
Expand Down
15 changes: 11 additions & 4 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pandas.tseries import frequencies
from pandas.tseries.offsets import Tick, DateOffset

from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin


Expand Down Expand Up @@ -132,7 +133,7 @@ def __new__(cls, values, freq=None, **kwargs):
# TODO: what if it has tz?
values = dt64arr_to_periodarr(values, freq)

return cls._simple_new(values, freq, **kwargs)
return cls._simple_new(values, freq=freq, **kwargs)

@classmethod
def _simple_new(cls, values, freq=None, **kwargs):
Expand All @@ -141,21 +142,27 @@ def _simple_new(cls, values, freq=None, **kwargs):
Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
"""

if is_period_dtype(values):
freq = dtl.validate_dtype_freq(values.dtype, freq)
values = values.asi8

if not is_integer_dtype(values):
values = np.array(values, copy=False)
if len(values) > 0 and is_float_dtype(values):
raise TypeError("{cls} can't take floats"
.format(cls=cls.__name__))
return cls(values, freq=freq)
return cls(values, freq=freq, **kwargs)

return cls._from_ordinals(values, freq)
return cls._from_ordinals(values, freq=freq, **kwargs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this causing bugs that kwargs was not passed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is necessary in order to allow PeriodIndex to inherit _simple_new from PeriodArrayMixin.

The PeriodIndex constructors in particular (but also DatetimeIndex and TimedeltaIndex) are really messy and complicated. Inheriting methods and deleting as much as possible makes it easier for me to simplify them, even if some of that will end up getting copy/pasted down the road if/when inheritance is replaced.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you then move the implementation and share it by calling the other class instead of inheriting?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean moving the _from_ordinals implementation? We need both because the Index version sets self.name and calls _reset_identity

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant in PeriodIndex._simple_new calling PeriodArrayMixin._simple_new, similar as you did in one of the previous PRs (didn't look at the code, so don't know if that is possible in this case)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I get what you're saying, will give it a shot

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did anything come of this, one way or the other?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. In an earlier commit I deleted PeriodIndex._simple_new and retained PeriodIndex._from_ordinals. Now that has been reversed, and its non-trivially nicer.


@classmethod
def _from_ordinals(cls, values, freq=None):
def _from_ordinals(cls, values, freq=None, **kwargs):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally I'd like to get rid of _from_ordinals and have _simple_new be the lowest-level constructor like with all the other classes. This turns out to be infeasible ATM because a bunch of parent class methods operate on .values instead of ._ndarray_values and end up passing object-dtype to _shallow_copy. This is being split off of a branch that is trying to avoid that.

"""
Values should be int ordinals
`__new__` & `_simple_new` cooerce to ordinals and call this method
"""
# **kwargs are included so that the signature matches PeriodIndex,
# letting us share _simple_new
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you want to share this? Once we actually don't inherit it anymore, we will need to add it back to PeriodIndex

Copy link
Contributor

@TomAugspurger TomAugspurger Oct 12, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the signature should match across index classes, and hopefully the implementation can be shared.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this one is about _from_ordinals, which will not be shared across index classes as it is Period-specific

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're moving towards getting rid of _from_ordinals in order to make PeriodIndex work like all the others. To do that, we need to simplify _simple_new, which means fixing the places where inappropriate values get passed to it, i.e. #23095.


values = np.array(values, dtype='int64', copy=False)

Expand Down
15 changes: 6 additions & 9 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,6 @@ def __new__(cls, values, freq=None, start=None, end=None, periods=None,

freq, freq_infer = dtl.maybe_infer_freq(freq)

if values is None:
# TODO: Remove this block and associated kwargs; GH#20535
if freq is None and com._any_none(periods, start, end):
raise ValueError('Must provide freq argument if no data is '
'supplied')
periods = dtl.validate_periods(periods)
return cls._generate_range(start, end, periods, freq,
closed=closed)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is still present in the TimedeltaIndex constructor, but its not too late to get it out of the TimedeltaArray constructor.

result = cls._simple_new(values, freq=freq)
if freq_infer:
inferred = result.inferred_freq
Expand All @@ -151,6 +142,12 @@ def __new__(cls, values, freq=None, start=None, end=None, periods=None,
@classmethod
def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs):
# **kwargs are for compat with TimedeltaIndex, which includes `name`

periods = dtl.validate_periods(periods)
if freq is None and any(x is None for x in [periods, start, end]):
raise ValueError('Must provide freq argument if no data is '
'supplied')

if com.count_not_none(start, end, periods, freq) != 3:
raise ValueError('Of the four parameters: start, end, periods, '
'and freq, exactly three must be specified')
Expand Down
4 changes: 0 additions & 4 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,6 @@ def __new__(cls, data=None,

if data is None:
# TODO: Remove this block and associated kwargs; GH#20535
if freq is None and com._any_none(periods, start, end):
raise ValueError('Must provide freq argument if no data is '
'supplied')
periods = dtl.validate_periods(periods)
return cls._generate_range(start, end, periods, name, freq,
tz=tz, normalize=normalize,
closed=closed, ambiguous=ambiguous)
Expand Down
28 changes: 3 additions & 25 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
if name is None and hasattr(data, 'name'):
name = data.name

if dtype is not None:
dtype = pandas_dtype(dtype)
if not is_period_dtype(dtype):
raise ValueError('dtype must be PeriodDtype')
if freq is None:
freq = dtype.freq
elif freq != dtype.freq:
msg = 'specified freq and dtype are different'
raise IncompatibleFrequency(msg)
freq = dtl.validate_dtype_freq(dtype, freq)

# coerce freq to freq object, otherwise it can be coerced elementwise
# which is slow
Expand Down Expand Up @@ -218,7 +210,7 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
# not array / index
if not isinstance(data, (np.ndarray, PeriodIndex,
DatetimeIndex, Int64Index)):
if is_scalar(data) or isinstance(data, Period):
if is_scalar(data):
cls._scalar_data_error(data)

# other iterable of some kind
Expand Down Expand Up @@ -248,21 +240,7 @@ def _engine(self):
return self._engine_type(lambda: self, len(self))

@classmethod
def _simple_new(cls, values, name=None, freq=None, **kwargs):
"""
Values can be any type that can be coerced to Periods.
Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
"""
if not is_integer_dtype(values):
values = np.array(values, copy=False)
if len(values) > 0 and is_float_dtype(values):
raise TypeError("PeriodIndex can't take floats")
return cls(values, name=name, freq=freq, **kwargs)

return cls._from_ordinals(values, name, freq, **kwargs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If keeping this now, I think it will be clearer what the changes are once we actually split index/array

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it matters too much either way. @jbrockmendel does reverting this change cause issues with master? Or does the "old" PeriodIndex._simple_new do the right thing, at the cost of code duplication? My PeriodArray PR is going to change this again anyway

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It has changed since joris's comment above, but this is just a matter of code duplication.

My PeriodArray PR is going to change this again anyway

I'm kind of hoping I can simplify these a good deal further before you finish up with SparseEA


@classmethod
def _from_ordinals(cls, values, name=None, freq=None, **kwargs):
def _from_ordinals(cls, values, freq=None, name=None):
"""
Values should be int ordinals
`__new__` & `_simple_new` cooerce to ordinals and call this method
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/period/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,20 +264,20 @@ def test_constructor_mixed(self):

def test_constructor_simple_new(self):
idx = period_range('2007-01', name='p', periods=2, freq='M')
result = idx._simple_new(idx, 'p', freq=idx.freq)
result = idx._simple_new(idx, name='p', freq=idx.freq)
tm.assert_index_equal(result, idx)

result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq)
result = idx._simple_new(idx.astype('i8'), name='p', freq=idx.freq)
tm.assert_index_equal(result, idx)

result = idx._simple_new([pd.Period('2007-01', freq='M'),
pd.Period('2007-02', freq='M')],
'p', freq=idx.freq)
name='p', freq=idx.freq)
tm.assert_index_equal(result, idx)

result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'),
pd.Period('2007-02', freq='M')]),
'p', freq=idx.freq)
name='p', freq=idx.freq)
tm.assert_index_equal(result, idx)

def test_constructor_simple_new_empty(self):
Expand Down