Skip to content

Commit 588b558

Browse files
TomAugspurger authored and jreback committed
REF: DatetimeLikeArray (#24024)
1 parent 8088fe0 commit 588b558

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+858
-304
lines changed

doc/source/whatsnew/v0.24.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ Backwards incompatible API changes
430430
- ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`)
431431
- :func:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`)
432432
- The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`)
433+
- Incorrectly passing a :class:`DatetimeIndex` to :meth:`MultiIndex.from_tuples`, rather than a sequence of tuples, now raises a ``TypeError`` rather than a ``ValueError`` (:issue:`24024`)
433434
- :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`)
434435

435436
Percentage change on groupby
@@ -1368,6 +1369,7 @@ Datetimelike
13681369
- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`)
13691370
- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`)
13701371
- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`)
1372+
- Bug in :attr:`DataFrame.values` returning a :class:`DatetimeIndex` for a single-column ``DataFrame`` with tz-aware datetime values. Now a 2-D :class:`numpy.ndarray` of :class:`Timestamp` objects is returned (:issue:`24024`)
13711373
- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`)
13721374
- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`)
13731375
- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`)
@@ -1384,6 +1386,7 @@ Datetimelike
13841386
- Bug in :func:`period_range` ignoring the frequency of ``start`` and ``end`` when those are provided as :class:`Period` objects (:issue:`20535`).
13851387
- Bug in :class:`PeriodIndex` with attribute ``freq.n`` greater than 1 where adding a :class:`DateOffset` object would return incorrect results (:issue:`23215`)
13861388
- Bug in :class:`Series` that interpreted string indices as lists of characters when setting datetimelike values (:issue:`23451`)
1389+
- Bug in :class:`DataFrame` when creating a new column from an ndarray of :class:`Timestamp` objects with timezones creating an object-dtype column, rather than datetime with timezone (:issue:`23932`)
13871390
- Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`)
13881391
- Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
13891392
- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and ``dtype=object`` would incorrectly raise a ``ValueError`` (:issue:`23524`)

pandas/_libs/src/ujson/python/objToJSON.c

+7-2
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,11 @@ static PyObject *get_values(PyObject *obj) {
228228
PRINTMARK();
229229

230230
if (values && !PyArray_CheckExact(values)) {
231+
232+
if (PyObject_HasAttrString(values, "to_numpy")) {
233+
values = PyObject_CallMethod(values, "to_numpy", NULL);
234+
}
235+
231236
if (PyObject_HasAttrString(values, "values")) {
232237
PyObject *subvals = get_values(values);
233238
PyErr_Clear();
@@ -279,8 +284,8 @@ static PyObject *get_values(PyObject *obj) {
279284
repr = PyString_FromString("<unknown dtype>");
280285
}
281286

282-
PyErr_Format(PyExc_ValueError, "%s or %s are not JSON serializable yet",
283-
PyString_AS_STRING(repr), PyString_AS_STRING(typeRepr));
287+
PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet",
288+
repr, typeRepr);
284289
Py_DECREF(repr);
285290
Py_DECREF(typeRepr);
286291

pandas/core/arrays/datetimelike.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def cmp_method(self, other):
4747
if isinstance(other, ABCDataFrame):
4848
return NotImplemented
4949

50-
if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
50+
if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, cls)):
5151
if other.ndim > 0 and len(self) != len(other):
5252
raise ValueError('Lengths must match to compare')
5353

@@ -1162,9 +1162,10 @@ def _addsub_offset_array(self, other, op):
11621162
left = lib.values_from_object(self.astype('O'))
11631163

11641164
res_values = op(left, np.array(other))
1165+
kwargs = {}
11651166
if not is_period_dtype(self):
1166-
return type(self)(res_values, freq='infer')
1167-
return self._from_sequence(res_values)
1167+
kwargs['freq'] = 'infer'
1168+
return self._from_sequence(res_values, **kwargs)
11681169

11691170
def _time_shift(self, periods, freq=None):
11701171
"""

pandas/core/arrays/datetimes.py

+111-36
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ def _dt_array_cmp(cls, op):
9797

9898
def wrapper(self, other):
9999
meth = getattr(dtl.DatetimeLikeArrayMixin, opname)
100+
# TODO: return NotImplemented for Series / Index and let pandas unbox
101+
# Right now, returning NotImplemented for Index fails because we
102+
# go into the index implementation, which may be a bug?
100103

101104
other = lib.item_from_zerodim(other)
102105

@@ -145,9 +148,16 @@ def wrapper(self, other):
145148
return ops.invalid_comparison(self, other, op)
146149
else:
147150
self._assert_tzawareness_compat(other)
148-
if not hasattr(other, 'asi8'):
149-
# ndarray, Series
150-
other = type(self)(other)
151+
if isinstance(other, (ABCIndexClass, ABCSeries)):
152+
other = other.array
153+
154+
if (is_datetime64_dtype(other) and
155+
not is_datetime64_ns_dtype(other) or
156+
not hasattr(other, 'asi8')):
157+
# e.g. other.dtype == 'datetime64[s]'
158+
# or an object-dtype ndarray
159+
other = type(self)._from_sequence(other)
160+
151161
result = meth(self, other)
152162
o_mask = other._isnan
153163

@@ -171,10 +181,24 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin,
171181
dtl.TimelikeOps,
172182
dtl.DatelikeOps):
173183
"""
174-
Assumes that subclass __new__/__init__ defines:
175-
tz
176-
_freq
177-
_data
184+
Pandas ExtensionArray for tz-naive or tz-aware datetime data.
185+
186+
.. versionadded:: 0.24.0
187+
188+
Parameters
189+
----------
190+
values : Series, Index, DatetimeArray, ndarray
191+
The datetime data.
192+
193+
For DatetimeArray `values` (or a Series or Index boxing one),
194+
`dtype` and `freq` will be extracted from `values`, with
195+
precedence given to an explicitly passed `freq` or tz-aware `dtype`.
196+
197+
dtype : numpy.dtype or DatetimeTZDtype
198+
Note that the only NumPy dtype allowed is 'datetime64[ns]'.
199+
freq : str or Offset, optional
200+
copy : bool, default False
201+
Whether to copy the underlying array of values.
178202
"""
179203
_typ = "datetimearray"
180204
_scalar_type = Timestamp
@@ -213,38 +237,84 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin,
213237
_dtype = None # type: Union[np.dtype, DatetimeTZDtype]
214238
_freq = None
215239

216-
@classmethod
217-
def _simple_new(cls, values, freq=None, tz=None):
218-
"""
219-
we require the we have a dtype compat for the values
220-
if we are passed a non-dtype compat, then coerce using the constructor
221-
"""
222-
assert isinstance(values, np.ndarray), type(values)
240+
def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
241+
if isinstance(values, (ABCSeries, ABCIndexClass)):
242+
values = values._values
243+
244+
if isinstance(values, type(self)):
245+
# validation
246+
dtz = getattr(dtype, 'tz', None)
247+
if dtz and values.tz is None:
248+
dtype = DatetimeTZDtype(tz=dtype.tz)
249+
elif dtz and values.tz:
250+
if not timezones.tz_compare(dtz, values.tz):
251+
msg = (
252+
"Timezone of the array and 'dtype' do not match. "
253+
"'{}' != '{}'"
254+
)
255+
raise TypeError(msg.format(dtz, values.tz))
256+
elif values.tz:
257+
dtype = values.dtype
258+
# freq = validate_values_freq(values, freq)
259+
if freq is None:
260+
freq = values.freq
261+
values = values._data
262+
263+
if not isinstance(values, np.ndarray):
264+
msg = (
265+
"Unexpected type '{}'. 'values' must be a DatetimeArray "
266+
"ndarray, or Series or Index containing one of those."
267+
)
268+
raise ValueError(msg.format(type(values).__name__))
269+
223270
if values.dtype == 'i8':
224271
# for compat with datetime/timedelta/period shared methods,
225272
# we can sometimes get here with int64 values. These represent
226273
# nanosecond UTC (or tz-naive) unix timestamps
227274
values = values.view(_NS_DTYPE)
228275

229-
assert values.dtype == 'M8[ns]', values.dtype
276+
if values.dtype != _NS_DTYPE:
277+
msg = (
278+
"The dtype of 'values' is incorrect. Must be 'datetime64[ns]'."
279+
" Got {} instead."
280+
)
281+
raise ValueError(msg.format(values.dtype))
230282

231-
result = object.__new__(cls)
232-
result._data = values
233-
result._freq = freq
234-
if tz is None:
235-
dtype = _NS_DTYPE
236-
else:
237-
tz = timezones.maybe_get_tz(tz)
238-
tz = timezones.tz_standardize(tz)
239-
dtype = DatetimeTZDtype('ns', tz)
240-
result._dtype = dtype
241-
return result
283+
dtype = pandas_dtype(dtype)
284+
_validate_dt64_dtype(dtype)
242285

243-
def __new__(cls, values, freq=None, tz=None, dtype=None, copy=False,
244-
dayfirst=False, yearfirst=False, ambiguous='raise'):
245-
return cls._from_sequence(
246-
values, freq=freq, tz=tz, dtype=dtype, copy=copy,
247-
dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous)
286+
if freq == "infer":
287+
msg = (
288+
"Frequency inference not allowed in DatetimeArray.__init__. "
289+
"Use 'pd.array()' instead."
290+
)
291+
raise ValueError(msg)
292+
293+
if copy:
294+
values = values.copy()
295+
if freq:
296+
freq = to_offset(freq)
297+
if getattr(dtype, 'tz', None):
298+
# https://github.com/pandas-dev/pandas/issues/18595
299+
# Ensure that we have a standard timezone for pytz objects.
300+
# Without this, things like adding an array of timedeltas and
301+
# a tz-aware Timestamp (with a tz specific to its datetime) will
302+
# be incorrect(ish?) for the array as a whole
303+
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
304+
305+
self._data = values
306+
self._dtype = dtype
307+
self._freq = freq
308+
309+
@classmethod
310+
def _simple_new(cls, values, freq=None, tz=None):
311+
"""
312+
we require that we have a dtype compat for the values
313+
if we are passed a non-dtype compat, then coerce using the constructor
314+
"""
315+
dtype = DatetimeTZDtype(tz=tz) if tz else _NS_DTYPE
316+
317+
return cls(values, freq=freq, dtype=dtype)
248318

249319
@classmethod
250320
def _from_sequence(cls, data, dtype=None, copy=False,
@@ -459,8 +529,7 @@ def __array__(self, dtype=None):
459529
elif is_int64_dtype(dtype):
460530
return self.asi8
461531

462-
# TODO: warn that conversion may be lossy?
463-
return self._data.view(np.ndarray) # follow Index.__array__
532+
return self._data
464533

465534
def __iter__(self):
466535
"""
@@ -519,7 +588,7 @@ def astype(self, dtype, copy=True):
519588

520589
@Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
521590
def _validate_fill_value(self, fill_value):
522-
if isna(fill_value):
591+
if isna(fill_value) or fill_value == iNaT:
523592
fill_value = iNaT
524593
elif isinstance(fill_value, (datetime, np.datetime64)):
525594
self._assert_tzawareness_compat(fill_value)
@@ -1574,6 +1643,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
15741643
# if dtype has an embedded tz, capture it
15751644
tz = validate_tz_from_dtype(dtype, tz)
15761645

1646+
if isinstance(data, ABCIndexClass):
1647+
data = data._data
1648+
15771649
# By this point we are assured to have either a numpy array or Index
15781650
data, copy = maybe_convert_dtype(data, copy)
15791651

@@ -1590,12 +1662,15 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
15901662
data, dayfirst=dayfirst, yearfirst=yearfirst)
15911663
tz = maybe_infer_tz(tz, inferred_tz)
15921664

1665+
# `data` may have originally been a Categorical[datetime64[ns, tz]],
1666+
# so we need to handle these types.
15931667
if is_datetime64tz_dtype(data):
1668+
# DatetimeArray -> ndarray
15941669
tz = maybe_infer_tz(tz, data.tz)
15951670
result = data._data
15961671

15971672
elif is_datetime64_dtype(data):
1598-
# tz-naive DatetimeArray/Index or ndarray[datetime64]
1673+
# tz-naive DatetimeArray or ndarray[datetime64]
15991674
data = getattr(data, "_data", data)
16001675
if data.dtype != _NS_DTYPE:
16011676
data = conversion.ensure_datetime64ns(data)
@@ -1750,7 +1825,7 @@ def maybe_convert_dtype(data, copy):
17501825
# GH#18664 preserve tz in going DTI->Categorical->DTI
17511826
# TODO: cases where we need to do another pass through this func,
17521827
# e.g. the categories are timedelta64s
1753-
data = data.categories.take(data.codes, fill_value=NaT)
1828+
data = data.categories.take(data.codes, fill_value=NaT)._values
17541829
copy = False
17551830

17561831
elif is_extension_type(data) and not is_datetime64tz_dtype(data):

pandas/core/arrays/period.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False):
179179

180180
@classmethod
181181
def _simple_new(cls, values, freq=None, **kwargs):
182-
# TODO(DatetimeArray): remove once all constructors are aligned.
183-
# alias from PeriodArray.__init__
182+
# alias for PeriodArray.__init__
184183
return cls(values, freq=freq, **kwargs)
185184

186185
@classmethod

0 commit comments

Comments
 (0)