Skip to content

Commit f7c0bbc

Browse files
jbrockmendel authored and jreback committed
Implement unique+array parts of 24024 (#24527)
1 parent 4b6be69 commit f7c0bbc

File tree

7 files changed

+100
-47
lines changed

7 files changed

+100
-47
lines changed

doc/source/whatsnew/v0.24.0.rst

+25
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,31 @@ is the case with :attr:`Period.end_time`, for example
673673
674674
p.end_time
675675
676+
.. _whatsnew_0240.api_breaking.datetime_unique:
677+
678+
The return type of :meth:`Series.unique` for datetime with timezone values has changed
679+
from a :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays.DatetimeArray` (:issue:`24024`).
680+
681+
.. ipython:: python
682+
683+
ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
684+
pd.Timestamp('2000', tz='UTC')])
685+
686+
*Previous Behavior*:
687+
688+
.. code-block:: ipython
689+
690+
In [3]: ser.unique()
691+
Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object)
692+
693+
694+
*New Behavior*:
695+
696+
.. ipython:: python
697+
698+
ser.unique()
699+
700+
676701
.. _whatsnew_0240.api_breaking.sparse_values:
677702

678703
Sparse Data Structure Refactor

pandas/arrays/__init__.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@
55
"""
66
from pandas.core.arrays import (
77
IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray,
8-
PandasArray
8+
PandasArray,
9+
DatetimeArrayMixin as DatetimeArray,
10+
TimedeltaArrayMixin as TimedeltaArray,
911
)
1012

1113

1214
__all__ = [
1315
'Categorical',
16+
'DatetimeArray',
1417
'IntegerArray',
1518
'IntervalArray',
1619
'PandasArray',
1720
'PeriodArray',
1821
'SparseArray',
22+
'TimedeltaArray',
1923
]

pandas/core/algorithms.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -350,21 +350,16 @@ def unique(values):
350350
if is_extension_array_dtype(values):
351351
# Dispatch to extension dtype's unique.
352352
return values.unique()
353+
elif is_datetime64tz_dtype(values):
354+
# TODO: merge this check into the previous one following #24024
355+
return values.unique()
353356

354357
original = values
355358
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
356359

357360
table = htable(len(values))
358361
uniques = table.unique(values)
359362
uniques = _reconstruct_data(uniques, dtype, original)
360-
361-
if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
362-
# we are special casing datetime64tz_dtype
363-
# to return an object array of tz-aware Timestamps
364-
365-
# TODO: it must return DatetimeArray with tz in pandas 2.0
366-
uniques = uniques.astype(object).values
367-
368363
return uniques
369364

370365

pandas/core/base.py

+14-6
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
from pandas.util._validators import validate_bool_kwarg
1616

1717
from pandas.core.dtypes.common import (
18-
is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype,
19-
is_extension_type, is_list_like, is_object_dtype, is_scalar)
18+
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
19+
is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
20+
is_scalar, is_timedelta64_ns_dtype)
2021
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
2122
from pandas.core.dtypes.missing import isna
2223

@@ -849,12 +850,19 @@ def array(self):
849850
"""
850851
result = self._values
851852

852-
# TODO(DatetimeArray): remvoe the second clause.
853-
if (not is_extension_array_dtype(result.dtype)
854-
and not is_datetime64tz_dtype(result.dtype)):
855-
from pandas.core.arrays.numpy_ import PandasArray
853+
if (is_datetime64_ns_dtype(result.dtype) or
854+
is_datetime64tz_dtype(result.dtype)):
855+
from pandas.arrays import DatetimeArray
856+
result = DatetimeArray(result)
857+
858+
elif is_timedelta64_ns_dtype(result.dtype):
859+
from pandas.arrays import TimedeltaArray
860+
result = TimedeltaArray(result)
856861

862+
elif not is_extension_array_dtype(result.dtype):
863+
from pandas.core.arrays.numpy_ import PandasArray
857864
result = PandasArray(result)
865+
858866
return result
859867

860868
def to_numpy(self, dtype=None, copy=False):

pandas/core/series.py

+22-17
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@
1717

1818
from pandas.core.dtypes.common import (
1919
_is_unorderable_exception, ensure_platform_int, is_bool,
20-
is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
21-
is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type,
22-
is_hashable, is_integer, is_iterator, is_list_like, is_scalar,
23-
is_string_like, is_timedelta64_dtype)
20+
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
21+
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
22+
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
2423
from pandas.core.dtypes.generic import (
2524
ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
2625
from pandas.core.dtypes.missing import (
@@ -1556,9 +1555,18 @@ def unique(self):
15561555
15571556
Returns
15581557
-------
1559-
ndarray or Categorical
1560-
The unique values returned as a NumPy array. In case of categorical
1561-
data type, returned as a Categorical.
1558+
ndarray or ExtensionArray
1559+
The unique values returned as a NumPy array. In case of an
1560+
extension-array backed Series, a new
1561+
:class:`~api.extensions.ExtensionArray` of that type with just
1562+
the unique values is returned. This includes:
1563+
1564+
* Categorical
1565+
* Period
1566+
* Datetime with Timezone
1567+
* Interval
1568+
* Sparse
1569+
* IntegerNA
15621570
15631571
See Also
15641572
--------
@@ -1575,8 +1583,9 @@ def unique(self):
15751583
15761584
>>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
15771585
... for _ in range(3)]).unique()
1578-
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
1579-
dtype=object)
1586+
<DatetimeArrayMixin>
1587+
['2016-01-01 00:00:00-05:00']
1588+
Length: 1, dtype: datetime64[ns, US/Eastern]
15801589
15811590
An unordered Categorical will return categories in the order of
15821591
appearance.
@@ -1593,14 +1602,10 @@ def unique(self):
15931602
Categories (3, object): [a < b < c]
15941603
"""
15951604
result = super(Series, self).unique()
1596-
1597-
if is_datetime64tz_dtype(self.dtype):
1598-
# we are special casing datetime64tz_dtype
1599-
# to return an object array of tz-aware Timestamps
1600-
1601-
# TODO: it must return DatetimeArray with tz in pandas 2.0
1602-
result = result.astype(object).values
1603-
1605+
if isinstance(result, DatetimeIndex):
1606+
# TODO: This should be unnecessary after Series._values returns
1607+
# DatetimeArray
1608+
result = result._eadata
16041609
return result
16051610

16061611
def drop_duplicates(self, keep='first', inplace=False):

pandas/tests/test_algos.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas._libs import (groupby as libgroupby, algos as libalgos,
1717
hashtable as ht)
1818
from pandas.compat import lrange, range
19+
from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray
1920
import pandas.core.algorithms as algos
2021
import pandas.core.common as com
2122
import pandas.util.testing as tm
@@ -456,9 +457,10 @@ def test_datetime64tz_aware(self):
456457
result = Series(
457458
Index([Timestamp('20160101', tz='US/Eastern'),
458459
Timestamp('20160101', tz='US/Eastern')])).unique()
459-
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
460-
tz='US/Eastern')], dtype=object)
461-
tm.assert_numpy_array_equal(result, expected)
460+
expected = DatetimeArray._from_sequence(np.array([
461+
Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
462+
]))
463+
tm.assert_extension_array_equal(result, expected)
462464

463465
result = Index([Timestamp('20160101', tz='US/Eastern'),
464466
Timestamp('20160101', tz='US/Eastern')]).unique()
@@ -469,9 +471,10 @@ def test_datetime64tz_aware(self):
469471
result = pd.unique(
470472
Series(Index([Timestamp('20160101', tz='US/Eastern'),
471473
Timestamp('20160101', tz='US/Eastern')])))
472-
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
473-
tz='US/Eastern')], dtype=object)
474-
tm.assert_numpy_array_equal(result, expected)
474+
expected = DatetimeArray._from_sequence(np.array([
475+
Timestamp('2016-01-01', tz="US/Eastern"),
476+
]))
477+
tm.assert_extension_array_equal(result, expected)
475478

476479
result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
477480
Timestamp('20160101', tz='US/Eastern')]))

pandas/tests/test_base.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,15 @@
1111
import pandas.compat as compat
1212
from pandas.core.dtypes.common import (
1313
is_object_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
14-
needs_i8_conversion)
14+
needs_i8_conversion, is_timedelta64_dtype)
1515
import pandas.util.testing as tm
1616
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
1717
PeriodIndex, Timedelta, IntervalIndex, Interval,
1818
CategoricalIndex, Timestamp, DataFrame, Panel)
19+
from pandas.core.arrays import (
20+
DatetimeArrayMixin as DatetimeArray,
21+
TimedeltaArrayMixin as TimedeltaArray,
22+
)
1923
from pandas.compat import StringIO, PYPY, long
2024
from pandas.compat.numpy import np_array_datetime64_compat
2125
from pandas.core.arrays import PandasArray
@@ -383,8 +387,12 @@ def test_value_counts_unique_nunique(self):
383387
assert result[0] == orig[0]
384388
for r in result:
385389
assert isinstance(r, Timestamp)
386-
tm.assert_numpy_array_equal(result,
387-
orig._values.astype(object).values)
390+
391+
# TODO(#24024) once orig._values returns DTA, remove
392+
# the `._eadata` below
393+
tm.assert_numpy_array_equal(
394+
result.astype(object),
395+
orig._values._eadata.astype(object))
388396
else:
389397
tm.assert_numpy_array_equal(result, orig.values)
390398

@@ -410,7 +418,9 @@ def test_value_counts_unique_nunique_null(self):
410418
else:
411419
o = o.copy()
412420
o[0:2] = iNaT
413-
values = o._values
421+
# TODO(#24024) once Series._values returns DTA, remove
422+
# the `._eadata` here
423+
values = o._values._eadata
414424

415425
elif needs_i8_conversion(o):
416426
values[0:2] = iNaT
@@ -431,7 +441,7 @@ def test_value_counts_unique_nunique_null(self):
431441
o = klass(values.repeat(range(1, len(o) + 1)))
432442
o.name = 'a'
433443
else:
434-
if is_datetime64tz_dtype(o):
444+
if isinstance(o, DatetimeIndex):
435445
expected_index = orig._values._shallow_copy(values)
436446
else:
437447
expected_index = Index(values)
@@ -472,8 +482,7 @@ def test_value_counts_unique_nunique_null(self):
472482
Index(values[1:], name='a'))
473483
elif is_datetime64tz_dtype(o):
474484
# unable to compare NaT / nan
475-
vals = values[2:].astype(object).values
476-
tm.assert_numpy_array_equal(result[1:], vals)
485+
tm.assert_extension_array_equal(result[1:], values[2:])
477486
assert result[0] is pd.NaT
478487
else:
479488
tm.assert_numpy_array_equal(result[1:], values[2:])
@@ -1187,7 +1196,6 @@ def test_ndarray_values(array, expected):
11871196

11881197
@pytest.mark.parametrize("arr", [
11891198
np.array([1, 2, 3]),
1190-
np.array([1, 2, 3], dtype="datetime64[ns]"),
11911199
])
11921200
def test_numpy_array(arr):
11931201
ser = pd.Series(arr)
@@ -1199,7 +1207,12 @@ def test_numpy_array(arr):
11991207
def test_numpy_array_all_dtypes(any_numpy_dtype):
12001208
ser = pd.Series(dtype=any_numpy_dtype)
12011209
result = ser.array
1202-
assert isinstance(result, PandasArray)
1210+
if is_datetime64_dtype(any_numpy_dtype):
1211+
assert isinstance(result, DatetimeArray)
1212+
elif is_timedelta64_dtype(any_numpy_dtype):
1213+
assert isinstance(result, TimedeltaArray)
1214+
else:
1215+
assert isinstance(result, PandasArray)
12031216

12041217

12051218
@pytest.mark.parametrize("array, attr", [

0 commit comments

Comments
 (0)