Skip to content

REF: Simplify Datetimelike constructor dispatching #23140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f13cc58
Avoid non-public constructors
jbrockmendel Oct 13, 2018
4188ec7
simplify and de-duplicate _generate_range
jbrockmendel Oct 13, 2018
7804f1b
Check for invalid axis kwarg
jbrockmendel Oct 13, 2018
a4775f4
Move some EA properties up to mixins
jbrockmendel Oct 13, 2018
8ee34fa
implement basic TimedeltaArray tests
jbrockmendel Oct 13, 2018
78943c1
clean up PeriodArray constructor, with tests
jbrockmendel Oct 13, 2018
aa71383
make PeriodArray.__new__ more grown-up
jbrockmendel Oct 13, 2018
eae8389
Remove unused kwargs from TimedeltaArray.__new__
jbrockmendel Oct 13, 2018
e871733
revert change that broke tests
jbrockmendel Oct 13, 2018
7840f91
Fixup whitespace
jbrockmendel Oct 14, 2018
ec50b0b
helper function for axis validation
jbrockmendel Oct 14, 2018
eb7a6b6
suggested clarifications
jbrockmendel Oct 14, 2018
32c6391
Merge branch 'dlike8' of https://github.com/jbrockmendel/pandas into …
jbrockmendel Oct 14, 2018
c903917
Merge branch 'master' of https://github.com/pandas-dev/pandas into dl…
jbrockmendel Oct 14, 2018
b97ec96
move axis validation to nv
jbrockmendel Oct 14, 2018
11db555
Merge branch 'master' of https://github.com/pandas-dev/pandas into dl…
jbrockmendel Oct 14, 2018
147de57
revert some removals
jbrockmendel Oct 14, 2018
7c4d281
Merge branch 'master' of https://github.com/pandas-dev/pandas into dl…
jbrockmendel Oct 15, 2018
b90f421
catch too-negative values
jbrockmendel Oct 15, 2018
dc4f474
Roll validate_minmax_axis into existing validate functions
jbrockmendel Oct 15, 2018
46d5e64
fixup typo
jbrockmendel Oct 15, 2018
b5827c7
Merge branch 'master' of https://github.com/pandas-dev/pandas into dl…
jbrockmendel Oct 17, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 49 additions & 8 deletions pandas/compat/numpy/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,30 @@ def __call__(self, args, kwargs, fname=None,


ARGMINMAX_DEFAULTS = dict(out=None)
validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin',
method='both', max_fname_arg_count=1)
validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax',
method='both', max_fname_arg_count=1)
_validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin',
method='both', max_fname_arg_count=1)
_validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax',
method='both', max_fname_arg_count=1)


def validate_argmin(args, kwargs, axis=None):
    """
    Validate np.argmin-compatible args/kwargs, then check that `axis`
    is valid for a 1-dimensional object (0, -1, or None).
    """
    _validate_argmin(args, kwargs)
    validate_minmax_axis(axis)


def validate_argmax(args, kwargs, axis=None):
    """
    Validate np.argmax-compatible args/kwargs, then check that `axis`
    is valid for a 1-dimensional object (0, -1, or None).
    """
    _validate_argmax(args, kwargs)
    validate_minmax_axis(axis)


def validate_min(args, kwargs, axis=None):
    """
    Validate np.min-compatible args/kwargs, then check that `axis`
    is valid for a 1-dimensional object (0, -1, or None).
    """
    _validate_min(args, kwargs)
    validate_minmax_axis(axis)


def validate_max(args, kwargs, axis=None):
    """
    Validate np.max-compatible args/kwargs, then check that `axis`
    is valid for a 1-dimensional object (0, -1, or None).
    """
    _validate_max(args, kwargs)
    validate_minmax_axis(axis)


def process_skipna(skipna, args):
Expand Down Expand Up @@ -196,10 +216,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs')

MINMAX_DEFAULTS = dict(out=None)
validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min',
method='both', max_fname_arg_count=1)
validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max',
method='both', max_fname_arg_count=1)
_validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min',
method='both', max_fname_arg_count=1)
_validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max',
method='both', max_fname_arg_count=1)

RESHAPE_DEFAULTS = dict(order='C')
validate_reshape = CompatValidator(RESHAPE_DEFAULTS, fname='reshape',
Expand Down Expand Up @@ -360,3 +380,24 @@ def validate_resampler_func(method, args, kwargs):
"{func}() instead".format(func=method)))
else:
raise TypeError("too many arguments passed in")


def validate_minmax_axis(axis):
    """
    Ensure that the axis argument passed to min, max, argmin, or argmax is
    zero or None, as otherwise it will be incorrectly ignored.

    Parameters
    ----------
    axis : int or None

    Raises
    ------
    ValueError
        If `axis` is an int outside the valid range for a 1-dimensional
        object (i.e. anything other than 0 or -1).
    """
    ndim = 1  # hard-coded for Index
    # None always passes; otherwise accept exactly the axes valid for a
    # 1-d object: 0 and -1 (i.e. -ndim <= axis < ndim).
    if axis is not None and not (-ndim <= axis < ndim):
        raise ValueError("`axis` must be fewer than the number of "
                         "dimensions ({ndim})".format(ndim=ndim))
25 changes: 22 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas._libs.tslibs.period import (
Period, DIFFERENT_FREQ_INDEX, IncompatibleFrequency)

from pandas.util._decorators import deprecate_kwarg
from pandas.errors import NullFrequencyError, PerformanceWarning
from pandas import compat

Expand Down Expand Up @@ -39,7 +40,6 @@
from pandas.core.algorithms import checked_add_with_arr

from .base import ExtensionOpsMixin
from pandas.util._decorators import deprecate_kwarg


def _make_comparison_op(cls, op):
Expand Down Expand Up @@ -143,6 +143,10 @@ def asi8(self):
# ------------------------------------------------------------------
# Array-like Methods

@property
def ndim(self):
    # Number of dimensions, derived from `shape`; the sibling `shape`
    # property returns (len(self),), so this is always 1.
    return len(self.shape)

@property
def shape(self):
return (len(self),)
Expand All @@ -151,6 +155,10 @@ def shape(self):
def size(self):
return np.prod(self.shape)

@property
def nbytes(self):
    # Total bytes consumed by the underlying ndarray of values.
    return self._ndarray_values.nbytes

def __len__(self):
    # Length is that of the backing data container.
    return len(self._data)

Expand Down Expand Up @@ -211,6 +219,10 @@ def astype(self, dtype, copy=True):
# ------------------------------------------------------------------
# Null Handling

def isna(self):
    """Boolean array indicating which values are missing (NaT)."""
    # EA Interface
    return self._isnan
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it needed to have the _isnan concept on the arrays? We use it in some internal methods on the Index class, but for Arrays it seems to me additional complexity compared to simply defining isna appropriately on each Array ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed elsewhere; can we mark as resolved?


@property # NB: override with cache_readonly in immutable subclasses
def _isnan(self):
""" return if each value is nan"""
Expand Down Expand Up @@ -332,6 +344,10 @@ def _validate_frequency(cls, index, freq, **kwargs):
# Frequency validation is not meaningful for Period Array/Index
return None

# DatetimeArray may pass `ambiguous`, nothing else will be accepted
# by cls._generate_range below
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why wouldn’t u just pop the kwarg for key and pass it directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm actually that ends up being appreciably more verbose. We have to do separate cls._generate_range calls for TimedeltaArray vs DatetimeArray

assert all(key == 'ambiguous' for key in kwargs)

inferred = index.inferred_freq
if index.size == 0 or inferred == freq.freqstr:
return None
Expand Down Expand Up @@ -595,9 +611,12 @@ def _time_shift(self, periods, freq=None):

start = self[0] + periods * self.freq
end = self[-1] + periods * self.freq
attribs = self._get_attributes_dict()

# Note: in the DatetimeTZ case, _generate_range will infer the
# appropriate timezone from `start` and `end`, so tz does not need
# to be passed explicitly.
return self._generate_range(start=start, end=end, periods=None,
**attribs)
freq=self.freq)

@classmethod
def _add_datetimelike_methods(cls):
Expand Down
27 changes: 21 additions & 6 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core.dtypes.common import (
is_integer_dtype, is_float_dtype, is_period_dtype, is_timedelta64_dtype,
is_object_dtype,
is_datetime64_dtype, _TD_DTYPE)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.dtypes.generic import ABCSeries
Expand Down Expand Up @@ -122,18 +123,30 @@ def freq(self, value):

_attributes = ["freq"]

def __new__(cls, values, freq=None, **kwargs):
def __new__(cls, values, freq=None, dtype=None, **kwargs):

if freq is not None:
# coerce freq to freq object, otherwise it can be coerced
# elementwise, which is slow
freq = Period._maybe_convert_freq(freq)

freq = dtl.validate_dtype_freq(dtype, freq)

if is_period_dtype(values):
# PeriodArray, PeriodIndex
if freq is not None and values.freq != freq:
raise IncompatibleFrequency(freq, values.freq)
freq = values.freq
freq = dtl.validate_dtype_freq(values.dtype, freq)
values = values.asi8

elif is_datetime64_dtype(values):
# TODO: what if it has tz?
values = dt64arr_to_periodarr(values, freq)

elif is_object_dtype(values) or isinstance(values, (list, tuple)):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be is_list_like? (for the isinstance check)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is specifically for object dtype (actually, I need to add dtype=object to the np.array call below) since we're calling libperiod.extract_ordinals, which expects object dtype.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

specifically what happens if other non ndarray list likes hit this path? do they need handling?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They do need handling, but we're not there yet. The thought process for implementing these constructors piece-by-piece is

a) The DatetimeIndex/TimedeltaIndex/PeriodIndex constructors are overgrown; let's avoid that in the Array subclasses.
b) Avoid letting the implementations get too far ahead of the tests

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other question: where was this handled previously?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's hard for me to say what's better in the abstract.

From the WIP PeriodArray PR, I found that having to think carefully about what type of data I had forced some clarity in the code. I liked having to explicitly reach for that _from_periods constructor.

Regardless, I think our two goals with the array constructors should be

  1. Maximizing developer happiness (i.e. not users at the moment)
  2. Making it easier to reuse code between Index & Array subclasses

If you think we're likely to end up in a situation where being able to pass an array of objects to the main __init__ will make things easier, then by all means.

Copy link
Contributor

@jreback jreback Oct 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i am a bit puzzled why you would handle lists and and ndarray differently (tom and joris); these are clearly doing the same thing and we have a very similar handling for list likes throughout pandas

separating these is a non starter - even having a separate constructor is also not very friendly. pandas does inference on the construction which is one of the big selling points. trying to change this, esp at the micro level is a huge mental disconnect.

if you want to propose something like that pls do it in other issues.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i am a bit puzzled why you would handle lists and and ndarray differently (tom and joris)

I don't think we are.

But, my only argument was

From the WIP PeriodArray PR, I found that having to think carefully about what type of data I had forced some clarity in the code. I liked having to explicitly reach for that _from_periods constructor.

If that's not persuasive then I'm not going to argue against handling them in the init.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

having to think carefully

+1

Maximizing developer happiness

+1

Making it easier to reuse code

+1

If you think we're likely to end up in a situation where being able to pass an array of objects to the main

Yes, I think we should be pretty forgiving about what gets accepted into __init__ (for all three of Period/Datetime/Timedelta Arrays). Definitely don't want the start, end, periods currently in the Index subclass constructors. I think by excluding those we'll keep these constructors fairly straightforward.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i am a bit puzzled why you would handle lists and and ndarray differently

It's not about lists vs arrays, it's about arrays of Period objects vs arrays of ordinal integers, which is something very different.

I think we should be pretty forgiving about what gets accepted into init

Being forgiving is exactly what lead to the complex Period/DatetimeIndex constructors. I think we should not make the same choice for our Array classes.
Of course it doesn't need to be that complex, as I think there are here two main usecases discussed: an array of scalar objects (eg Periods or Timestamps), or an array of the underlying storage type (eg datetime64 or ordinal integers).

I personally also think it makes the code clearer to even separate those two concepts (basically what we also did with IntegerArray), but maybe let's open an issue to further discuss that instead of here in a hidden review comment thread? (i can only open one later today )

# e.g. array([Period(...), Period(...), NaT])
values = np.array(values, dtype=object)
if freq is None:
freq = libperiod.extract_freq(values)
values = libperiod.extract_ordinals(values, freq)

return cls._simple_new(values, freq=freq, **kwargs)

@classmethod
Expand Down Expand Up @@ -176,11 +189,13 @@ def _from_ordinals(cls, values, freq=None, **kwargs):

@classmethod
def _generate_range(cls, start, end, periods, freq, fields):
periods = dtl.validate_periods(periods)

if freq is not None:
freq = Period._maybe_convert_freq(freq)

field_count = len(fields)
if com.count_not_none(start, end) > 0:
if start is not None or end is not None:
if field_count > 0:
raise ValueError('Can either instantiate from fields '
'or endpoints, but not both')
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ def _simple_new(cls, values, freq=None, **kwargs):
result._freq = freq
return result

def __new__(cls, values, freq=None, start=None, end=None, periods=None,
closed=None):
def __new__(cls, values, freq=None):

freq, freq_infer = dtl.maybe_infer_freq(freq)

Expand All @@ -140,8 +139,7 @@ def __new__(cls, values, freq=None, start=None, end=None, periods=None,
return result

@classmethod
def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs):
# **kwargs are for compat with TimedeltaIndex, which includes `name`
def _generate_range(cls, start, end, periods, freq, closed=None):

periods = dtl.validate_periods(periods)
if freq is None and any(x is None for x in [periods, start, end]):
Expand All @@ -167,10 +165,9 @@ def _generate_range(cls, start, end, periods, freq, closed=None, **kwargs):

if freq is not None:
index = _generate_regular_range(start, end, periods, freq)
index = cls._simple_new(index, freq=freq, **kwargs)
index = cls._simple_new(index, freq=freq)
else:
index = np.linspace(start.value, end.value, periods).astype('i8')
# TODO: shouldn't we pass `name` here? (via **kwargs)
index = cls._simple_new(index, freq=freq)

if not left_closed:
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def min(self, axis=None, *args, **kwargs):
--------
numpy.ndarray.min
"""
nv.validate_min(args, kwargs)
nv.validate_min(args, kwargs, axis=axis)

try:
i8 = self.asi8
Expand Down Expand Up @@ -458,7 +458,7 @@ def argmin(self, axis=None, *args, **kwargs):
--------
numpy.ndarray.argmin
"""
nv.validate_argmin(args, kwargs)
nv.validate_argmin(args, kwargs, axis=axis)

i8 = self.asi8
if self.hasnans:
Expand All @@ -478,7 +478,7 @@ def max(self, axis=None, *args, **kwargs):
--------
numpy.ndarray.max
"""
nv.validate_max(args, kwargs)
nv.validate_max(args, kwargs, axis=axis)

try:
i8 = self.asi8
Expand Down Expand Up @@ -506,7 +506,7 @@ def argmax(self, axis=None, *args, **kwargs):
--------
numpy.ndarray.argmax
"""
nv.validate_argmax(args, kwargs)
nv.validate_argmax(args, kwargs, axis=axis)

i8 = self.asi8
if self.hasnans:
Expand Down Expand Up @@ -699,6 +699,12 @@ def astype(self, dtype, copy=True):
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)

@Appender(DatetimeLikeArrayMixin._time_shift.__doc__)
def _time_shift(self, periods, freq=None):
    # Delegate the shift to the array mixin implementation, then
    # re-attach the Index-level `name`, which the array result lacks.
    result = DatetimeLikeArrayMixin._time_shift(self, periods, freq=freq)
    result.name = self.name
    return result


def _ensure_datetimelike_to_i8(other, to_utc=False):
"""
Expand Down
19 changes: 5 additions & 14 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,9 +241,11 @@ def __new__(cls, data=None,

if data is None:
# TODO: Remove this block and associated kwargs; GH#20535
return cls._generate_range(start, end, periods, name, freq,
tz=tz, normalize=normalize,
closed=closed, ambiguous=ambiguous)
result = cls._generate_range(start, end, periods,
freq=freq, tz=tz, normalize=normalize,
closed=closed, ambiguous=ambiguous)
result.name = name
return result

if not isinstance(data, (np.ndarray, Index, ABCSeries,
DatetimeArrayMixin)):
Expand Down Expand Up @@ -315,17 +317,6 @@ def __new__(cls, data=None,

return subarr._deepcopy_if_needed(ref_to_data, copy)

@classmethod
@Appender(DatetimeArrayMixin._generate_range.__doc__)
def _generate_range(cls, start, end, periods, name=None, freq=None,
tz=None, normalize=False, ambiguous='raise',
closed=None):
out = super(DatetimeIndex, cls)._generate_range(
start, end, periods, freq,
tz=tz, normalize=normalize, ambiguous=ambiguous, closed=closed)
out.name = name
return out

def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """
if self._has_same_tz(value):
Expand Down
2 changes: 0 additions & 2 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,6 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
raise TypeError('__new__() got an unexpected keyword argument {}'.
format(list(set(fields) - valid_field_set)[0]))

periods = dtl.validate_periods(periods)

if name is None and hasattr(data, 'name'):
name = data.name

Expand Down
20 changes: 4 additions & 16 deletions pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,10 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,

if data is None:
# TODO: Remove this block and associated kwargs; GH#20535
if freq is None and com._any_none(periods, start, end):
raise ValueError('Must provide freq argument if no data is '
'supplied')
periods = dtl.validate_periods(periods)
return cls._generate_range(start, end, periods, name, freq,
closed=closed)
out = cls._generate_range(start, end, periods,
freq=freq, closed=closed)
out.name = name
return out

if unit is not None:
data = to_timedelta(data, unit=unit, box=False)
Expand Down Expand Up @@ -181,16 +179,6 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,

return subarr

@classmethod
def _generate_range(cls, start, end, periods,
name=None, freq=None, closed=None):
# TimedeltaArray gets `name` via **kwargs, so we need to explicitly
# override it if name is passed as a positional argument
return super(TimedeltaIndex, cls)._generate_range(start, end,
periods, freq,
name=name,
closed=closed)

@classmethod
def _simple_new(cls, values, name=None, freq=None, **kwargs):
result = super(TimedeltaIndex, cls)._simple_new(values, freq, **kwargs)
Expand Down
3 changes: 1 addition & 2 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2476,8 +2476,7 @@ def _get_index_factory(self, klass):
if klass == DatetimeIndex:
def f(values, freq=None, tz=None):
# data are already in UTC, localize and convert if tz present
result = DatetimeIndex._simple_new(values.values, name=None,
freq=freq)
result = DatetimeIndex(values.values, name=None, freq=freq)
if tz is not None:
result = result.tz_localize('UTC').tz_convert(tz)
return result
Expand Down
Loading