Skip to content

Commit d398e5b

Browse files
committed
fix tz-aware unique
1 parent 656963c commit d398e5b

File tree

5 files changed

+81
-8
lines changed

5 files changed

+81
-8
lines changed

doc/source/whatsnew/v0.20.0.txt

+52
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,58 @@ result. On the other hand, this might have backward incompatibilities: e.g.
594594
compared to numpy arrays, ``Index`` objects are not mutable. To get the original
595595
ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``.
596596

597+
.. _whatsnew_0200.api_breaking.unique:
598+
599+
pd.unique will now be consistent with extension types
600+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
601+
602+
In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. Both now return the same type: a ``Categorical`` for categorical data, and an object array of tz-aware ``Timestamp`` values for a tz-aware ``Series``.
603+
604+
Previous Behavior:
605+
606+
Datetime tz-aware
607+
608+
.. code-block:: ipython
609+
610+
In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
611+
pd.Timestamp('20160101', tz='US/Eastern')])).unique()
612+
Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object)
613+
614+
In [7]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
615+
pd.Timestamp('20160101', tz='US/Eastern')])))
616+
Out[7]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
617+
618+
Categoricals
619+
620+
.. code-block:: ipython
621+
622+
623+
In [1]: pd.Series(pd.Categorical(list('aabc'))).unique()
624+
Out[1]:
625+
[a, b, c]
626+
Categories (3, object): [a, b, c]
627+
628+
In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))))
629+
Out[2]: array(['a', 'b', 'c'], dtype=object)
630+
631+
New Behavior:
632+
633+
Datetime tz-aware
634+
635+
.. ipython:: python
636+
637+
Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
638+
pd.Timestamp('20160101', tz='US/Eastern')])).unique()
639+
pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
640+
pd.Timestamp('20160101', tz='US/Eastern')])))
641+
642+
Categoricals
643+
644+
.. ipython:: python
645+
646+
pd.Series(pd.Categorical(list('aabc'))).unique()
647+
pd.unique(pd.Series(pd.Categorical(list('aabc'))))
648+
597649
.. _whatsnew_0200.api_breaking.s3:
598650

599651
S3 File Handling

pandas/core/algorithms.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,10 @@ def unique(values):
278278
279279
Returns
280280
-------
281-
unique values. The returned type will be a pandas Index
282-
if a pandas type is input, otherwise ndarray
281+
unique values.
282+
- If the input is a Categorical dtype, the return is a Categorical
283+
- If the input is an Index, the return is an Index
284+
- If the input is a Series/ndarray, the return will be an ndarray
283285
284286
Examples
285287
--------
@@ -293,6 +295,11 @@ def unique(values):
293295
... pd.Timestamp('20160101')]))
294296
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
295297
298+
>>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
299+
... pd.Timestamp('20160101', tz='US/Eastern')]))
300+
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
301+
dtype=object)
302+
296303
>>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
297304
... pd.Timestamp('20160101', tz='US/Eastern')]))
298305
DatetimeIndex(['2016-01-01 00:00:00-05:00'],
@@ -309,6 +316,7 @@ def unique(values):
309316
Categories (3, object): [a, b, c]
310317
311318
"""
319+
312320
values = _ensure_arraylike(values)
313321

314322
# categorical is a fast-path
@@ -326,6 +334,13 @@ def unique(values):
326334
uniques = table.unique(values)
327335
uniques = _reconstruct_data(uniques, dtype, original)
328336

337+
if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
338+
# we are special casing datetime64tz_dtype
339+
# to return an object array of tz-aware Timestamps
340+
341+
# TODO: it must return DatetimeArray with tz in pandas 2.0
342+
uniques = uniques.asobject.values
343+
329344
return uniques
330345

331346

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,7 @@ def unique(self):
873873
else:
874874
from pandas.core.algorithms import unique1d
875875
result = unique1d(values)
876+
876877
return result
877878

878879
def nunique(self, dropna=True):

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1204,10 +1204,14 @@ def mode(self):
12041204
@Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
12051205
def unique(self):
12061206
result = super(Series, self).unique()
1207+
12071208
if is_datetime64tz_dtype(self.dtype):
1208-
# to return array of Timestamp with tz
1209-
# ToDo: it must return DatetimeArray with tz in pandas 2.0
1210-
return result.asobject.values
1209+
# we are special casing datetime64tz_dtype
1210+
# to return an object array of tz-aware Timestamps
1211+
1212+
# TODO: it must return DatetimeArray with tz in pandas 2.0
1213+
result = result.asobject.values
1214+
12111215
return result
12121216

12131217
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)

pandas/tests/test_algos.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ def test_categorical(self):
401401

402402
def test_datetime64tz_aware(self):
403403
# GH 15939
404+
404405
result = Series(
405406
pd.Index([Timestamp('20160101', tz='US/Eastern'),
406407
Timestamp('20160101', tz='US/Eastern')])).unique()
@@ -417,9 +418,9 @@ def test_datetime64tz_aware(self):
417418
result = pd.unique(
418419
Series(pd.Index([Timestamp('20160101', tz='US/Eastern'),
419420
Timestamp('20160101', tz='US/Eastern')])))
420-
expected = DatetimeIndex(['2016-01-01 00:00:00'],
421-
dtype='datetime64[ns, US/Eastern]', freq=None)
422-
tm.assert_index_equal(result, expected)
421+
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
422+
tz='US/Eastern')], dtype=object)
423+
tm.assert_numpy_array_equal(result, expected)
423424

424425
result = pd.unique(pd.Index([Timestamp('20160101', tz='US/Eastern'),
425426
Timestamp('20160101', tz='US/Eastern')]))

0 commit comments

Comments
 (0)