-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: Simplify Datetimelike constructor dispatching #23140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
f13cc58
4188ec7
7804f1b
a4775f4
8ee34fa
78943c1
aa71383
eae8389
e871733
7840f91
ec50b0b
eb7a6b6
32c6391
c903917
b97ec96
11db555
147de57
7c4d281
b90f421
dc4f474
46d5e64
b5827c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
from pandas._libs.tslibs.period import ( | ||
Period, DIFFERENT_FREQ_INDEX, IncompatibleFrequency) | ||
|
||
from pandas.util._decorators import deprecate_kwarg | ||
from pandas.errors import NullFrequencyError, PerformanceWarning | ||
from pandas import compat | ||
|
||
|
@@ -39,7 +40,6 @@ | |
from pandas.core.algorithms import checked_add_with_arr | ||
|
||
from .base import ExtensionOpsMixin | ||
from pandas.util._decorators import deprecate_kwarg | ||
|
||
|
||
def _make_comparison_op(cls, op): | ||
|
@@ -143,6 +143,10 @@ def asi8(self): | |
# ------------------------------------------------------------------ | ||
# Array-like Methods | ||
|
||
@property | ||
def ndim(self): | ||
return len(self.shape) | ||
|
||
@property | ||
def shape(self): | ||
return (len(self),) | ||
|
@@ -151,6 +155,10 @@ def shape(self): | |
def size(self): | ||
return np.prod(self.shape) | ||
|
||
@property | ||
def nbytes(self): | ||
return self._ndarray_values.nbytes | ||
|
||
def __len__(self): | ||
return len(self._data) | ||
|
||
|
@@ -211,6 +219,10 @@ def astype(self, dtype, copy=True): | |
# ------------------------------------------------------------------ | ||
# Null Handling | ||
|
||
def isna(self): | ||
# EA Interface | ||
return self._isnan | ||
|
||
@property # NB: override with cache_readonly in immutable subclasses | ||
def _isnan(self): | ||
""" return if each value is nan""" | ||
|
@@ -332,6 +344,10 @@ def _validate_frequency(cls, index, freq, **kwargs): | |
# Frequency validation is not meaningful for Period Array/Index | ||
return None | ||
|
||
# DatetimeArray may pass `ambiguous`, nothing else will be accepted | ||
# by cls._generate_range below | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why wouldn’t you just pop the kwarg for key and pass it directly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, actually that ends up being appreciably more verbose: we would have to make separate cls._generate_range calls for TimedeltaArray vs. DatetimeArray. |
||
assert all(key == 'ambiguous' for key in kwargs) | ||
|
||
inferred = index.inferred_freq | ||
if index.size == 0 or inferred == freq.freqstr: | ||
return None | ||
|
@@ -590,9 +606,12 @@ def _time_shift(self, periods, freq=None): | |
|
||
start = self[0] + periods * self.freq | ||
end = self[-1] + periods * self.freq | ||
attribs = self._get_attributes_dict() | ||
|
||
# Note: in the DatetimeTZ case, _generate_range will infer the | ||
# appropriate timezone from `start` and `end`, so tz does not need | ||
# to be passed explicitly. | ||
return self._generate_range(start=start, end=end, periods=None, | ||
**attribs) | ||
freq=self.freq) | ||
|
||
@classmethod | ||
def _add_datetimelike_methods(cls): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
|
||
from pandas.core.dtypes.common import ( | ||
is_integer_dtype, is_float_dtype, is_period_dtype, | ||
is_object_dtype, | ||
is_datetime64_dtype) | ||
from pandas.core.dtypes.dtypes import PeriodDtype | ||
from pandas.core.dtypes.generic import ABCSeries | ||
|
@@ -121,18 +122,30 @@ def freq(self, value): | |
|
||
_attributes = ["freq"] | ||
|
||
def __new__(cls, values, freq=None, **kwargs): | ||
def __new__(cls, values, freq=None, dtype=None, **kwargs): | ||
|
||
if freq is not None: | ||
# coerce freq to freq object, otherwise it can be coerced | ||
# elementwise, which is slow | ||
freq = Period._maybe_convert_freq(freq) | ||
|
||
freq = dtl.validate_dtype_freq(dtype, freq) | ||
|
||
if is_period_dtype(values): | ||
# PeriodArray, PeriodIndex | ||
if freq is not None and values.freq != freq: | ||
raise IncompatibleFrequency(freq, values.freq) | ||
freq = values.freq | ||
freq = dtl.validate_dtype_freq(values.dtype, freq) | ||
values = values.asi8 | ||
|
||
elif is_datetime64_dtype(values): | ||
# TODO: what if it has tz? | ||
values = dt64arr_to_periodarr(values, freq) | ||
|
||
elif is_object_dtype(values) or isinstance(values, (list, tuple)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't this be is_list_like? (for the isinstance check) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is specifically for object dtype (actually, I need to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. specifically what happens if other non ndarray list likes hit this path? do they need handling? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They do need handling, but we're not there yet. The thought process for implementing these constructors piece-by-piece is a) The DatetimeIndex/TimedeltaIndex/PeriodIndex constructors are overgrown; let's avoid that in the Array subclasses. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Other question: where was this handled previously? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's hard for me to say what's better in the abstract. From the WIP PeriodArray PR, I found that having to think carefully about what type of data I had forced some clarity in the code. I liked having to explicitly reach for that Regardless, I think our two goals with the array constructors should be
If you think we're likely to end up in a situation where being able to pass an array of objects to the main There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am a bit puzzled why you would handle lists and ndarrays differently (Tom and Joris); these are clearly doing the same thing, and we have very similar handling for list-likes throughout pandas. Separating these is a non-starter - even having a separate constructor is not very friendly. pandas does inference on construction, which is one of its big selling points. Trying to change this, especially at the micro level, is a huge mental disconnect. If you want to propose something like that, please do it in other issues. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think we are. But, my only argument was
If that's not persuasive then I'm not going to argue against handling them in the init. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
+1
+1
+1
Yes, I think we should be pretty forgiving about what gets accepted into There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's not about lists vs arrays, it's about arrays of Period objects vs arrays of ordinal integers, which is something very different.
Being forgiving is exactly what led to the complex Period/DatetimeIndex constructors. I think we should not make the same choice for our Array classes. I personally also think it makes the code clearer to even separate those two concepts (basically what we also did with IntegerArray), but maybe let's open an issue to further discuss that instead of here in a hidden review comment thread? (I can only open one later today.) |
||
# e.g. array([Period(...), Period(...), NaT]) | ||
values = np.array(values, dtype=object) | ||
if freq is None: | ||
freq = libperiod.extract_freq(values) | ||
values = libperiod.extract_ordinals(values, freq) | ||
|
||
return cls._simple_new(values, freq=freq, **kwargs) | ||
|
||
@classmethod | ||
|
@@ -175,11 +188,13 @@ def _from_ordinals(cls, values, freq=None, **kwargs): | |
|
||
@classmethod | ||
def _generate_range(cls, start, end, periods, freq, fields): | ||
periods = dtl.validate_periods(periods) | ||
|
||
if freq is not None: | ||
freq = Period._maybe_convert_freq(freq) | ||
|
||
field_count = len(fields) | ||
if com.count_not_none(start, end) > 0: | ||
if start is not None or end is not None: | ||
if field_count > 0: | ||
raise ValueError('Can either instantiate from fields ' | ||
'or endpoints, but not both') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -430,6 +430,7 @@ def min(self, axis=None, *args, **kwargs): | |
-------- | ||
numpy.ndarray.min | ||
""" | ||
_validate_minmax_axis(axis) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is not what I mean. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, done. |
||
nv.validate_min(args, kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason not to add the axis validation to the existing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly; I don't want another function. Rather, you can simply check this inside the function that is already there. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. I'm not wild about the fact that the nv.validate_(min|max|argmin|argmax) functions now implicitly assume they are only being called on 1-dim objects, but at least the assumption is correct for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, yeah, that makes sense. |
||
|
||
try: | ||
|
@@ -458,6 +459,7 @@ def argmin(self, axis=None, *args, **kwargs): | |
-------- | ||
numpy.ndarray.argmin | ||
""" | ||
_validate_minmax_axis(axis) | ||
nv.validate_argmin(args, kwargs) | ||
|
||
i8 = self.asi8 | ||
|
@@ -478,6 +480,7 @@ def max(self, axis=None, *args, **kwargs): | |
-------- | ||
numpy.ndarray.max | ||
""" | ||
_validate_minmax_axis(axis) | ||
nv.validate_max(args, kwargs) | ||
|
||
try: | ||
|
@@ -506,6 +509,7 @@ def argmax(self, axis=None, *args, **kwargs): | |
-------- | ||
numpy.ndarray.argmax | ||
""" | ||
_validate_minmax_axis(axis) | ||
nv.validate_argmax(args, kwargs) | ||
|
||
i8 = self.asi8 | ||
|
@@ -699,6 +703,31 @@ def astype(self, dtype, copy=True): | |
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) | ||
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy) | ||
|
||
@Appender(DatetimeLikeArrayMixin._time_shift.__doc__) | ||
def _time_shift(self, periods, freq=None): | ||
result = DatetimeLikeArrayMixin._time_shift(self, periods, freq=freq) | ||
result.name = self.name | ||
return result | ||
|
||
|
||
def _validate_minmax_axis(axis): | ||
""" | ||
Ensure that the axis argument passed to min, max, argmin, or argmax is | ||
zero or None, as otherwise it will be incorrectly ignored. | ||
|
||
Parameters | ||
---------- | ||
axis : int or None | ||
|
||
Raises | ||
------ | ||
ValueError | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see my comment above |
||
ndim = 1 # hard-coded for Index | ||
if axis is not None and axis >= ndim: | ||
raise ValueError("`axis` must be fewer than the number of " | ||
"dimensions ({ndim})".format(ndim=ndim)) | ||
|
||
|
||
def _ensure_datetimelike_to_i8(other, to_utc=False): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -241,9 +241,11 @@ def __new__(cls, data=None, | |
|
||
if data is None: | ||
# TODO: Remove this block and associated kwargs; GH#20535 | ||
return cls._generate_range(start, end, periods, name, freq, | ||
tz=tz, normalize=normalize, | ||
closed=closed, ambiguous=ambiguous) | ||
result = cls._generate_range(start, end, periods, | ||
freq=freq, tz=tz, normalize=normalize, | ||
closed=closed, ambiguous=ambiguous) | ||
result.name = name | ||
return result | ||
|
||
if not isinstance(data, (np.ndarray, Index, ABCSeries, | ||
DatetimeArrayMixin)): | ||
|
@@ -315,17 +317,6 @@ def __new__(cls, data=None, | |
|
||
return subarr._deepcopy_if_needed(ref_to_data, copy) | ||
|
||
@classmethod | ||
@Appender(DatetimeArrayMixin._generate_range.__doc__) | ||
def _generate_range(cls, start, end, periods, name=None, freq=None, | ||
tz=None, normalize=False, ambiguous='raise', | ||
closed=None): | ||
out = super(DatetimeIndex, cls)._generate_range( | ||
start, end, periods, freq, | ||
tz=tz, normalize=normalize, ambiguous=ambiguous, closed=closed) | ||
out.name = name | ||
return out | ||
|
||
@classmethod | ||
def _use_cached_range(cls, freq, _normalized, start, end): | ||
# Note: This always returns False | ||
|
@@ -389,27 +380,6 @@ def tz(self, value): | |
raise AttributeError("Cannot directly set timezone. Use tz_localize() " | ||
"or tz_convert() as appropriate") | ||
|
||
@property | ||
def size(self): | ||
# TODO: Remove this when we have a DatetimeTZArray | ||
# Necessary to avoid recursion error since DTI._values is a DTI | ||
# for TZ-aware | ||
return self._ndarray_values.size | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are you removing those? Those will need to be added back once we do the actual index/array split anyway, as they will be calling in the underlying array? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Because I am OK with needing to add them back in a few days (hopefully) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But can you then try to explain to me what the advantage is of moving it now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
As long as one of the index classes is still inheriting from the ArrayMixin, there will be wrong / strange mixups, that need to be cleaned up
But how would you do that if the underlying values don't yet have those attributes, because it is not yet our internal array class? And why not move them when implementing such a decorator? Then you actually have overview of the full changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You have sufficiently frustrated me into reverting this so we can move this down the field. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche if you're still up, can you take a look at the newest push and verify that the parts you have a problem with have been removed? |
||
|
||
@property | ||
def shape(self): | ||
# TODO: Remove this when we have a DatetimeTZArray | ||
# Necessary to avoid recursion error since DTI._values is a DTI | ||
# for TZ-aware | ||
return self._ndarray_values.shape | ||
|
||
@property | ||
def nbytes(self): | ||
# TODO: Remove this when we have a DatetimeTZArray | ||
# Necessary to avoid recursion error since DTI._values is a DTI | ||
# for TZ-aware | ||
return self._ndarray_values.nbytes | ||
|
||
@classmethod | ||
def _cached_range(cls, start=None, end=None, periods=None, freq=None, | ||
name=None): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it needed to have the
_isnan
concept on the arrays? We use it in some internal methods on the Index class, but for Arrays it seems to me to add complexity compared to simply defining isna
appropriately on each Array? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed elsewhere; can we mark as resolved?