Skip to content

Commit 93872a5

Browse files
committed
fix tz-aware unique
1 parent 3c5c7dc commit 93872a5

File tree

5 files changed

+81
-8
lines changed

5 files changed

+81
-8
lines changed

doc/source/whatsnew/v0.20.0.txt

+52
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,58 @@ result. On the other hand, this might have backward incompatibilities: e.g.
593593
compared to numpy arrays, ``Index`` objects are not mutable. To get the original
594594
ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``.
595595

596+
.. _whatsnew_0200.api_breaking.unique:
597+
598+
pd.unique will now be consistent with extension types
599+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
600+
601+
In prior versions, ``Series.unique()`` and ``pd.unique(Series)`` returned different types for ``Categorical`` and tz-aware datetime data. They now return the same type: a ``Categorical`` for categorical data, and an object array of tz-aware ``Timestamp`` objects for a tz-aware ``Series``.
602+
603+
Previous behavior:
604+
605+
Datetime tz-aware
606+
607+
.. code-block:: ipython
608+
609+
In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
610+
pd.Timestamp('20160101', tz='US/Eastern')])).unique()
611+
Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object)
612+
613+
In [7]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
614+
pd.Timestamp('20160101', tz='US/Eastern')])))
615+
Out[7]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
616+
617+
Categoricals
618+
619+
.. code-block:: ipython
620+
621+
622+
In [1]: pd.Series(pd.Categorical(list('aabc'))).unique()
623+
Out[1]:
624+
[a, b, c]
625+
Categories (3, object): [a, b, c]
626+
627+
In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))))
628+
Out[2]: array(['a', 'b', 'c'], dtype=object)
629+
630+
New Behavior:
631+
632+
Datetime tz-aware
633+
634+
.. ipython:: python
635+
636+
Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
637+
pd.Timestamp('20160101', tz='US/Eastern')])).unique()
638+
pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
639+
pd.Timestamp('20160101', tz='US/Eastern')])))
640+
641+
Categoricals
642+
643+
.. ipython:: python
644+
645+
pd.Series(pd.Categorical(list('aabc'))).unique()
646+
pd.unique(pd.Series(pd.Categorical(list('aabc'))))
647+
596648
.. _whatsnew_0200.api_breaking.s3:
597649

598650
S3 File Handling

pandas/core/algorithms.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,10 @@ def unique(values):
278278
279279
Returns
280280
-------
281-
unique values. The returned type will be a pandas Index
282-
if a pandas type is input, otherwise ndarray
281+
unique values.
282+
- If the input is a Categorical dtype, the return is a Categorical
283+
- If the input is an Index, the return is an Index
284+
- If the input is a Series/ndarray, the return will be an ndarray
283285
284286
Examples
285287
--------
@@ -293,6 +295,11 @@ def unique(values):
293295
... pd.Timestamp('20160101')]))
294296
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
295297
298+
>>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
299+
... pd.Timestamp('20160101', tz='US/Eastern')]))
300+
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
301+
dtype=object)
302+
296303
>>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
297304
... pd.Timestamp('20160101', tz='US/Eastern')]))
298305
DatetimeIndex(['2016-01-01 00:00:00-05:00'],
@@ -309,6 +316,7 @@ def unique(values):
309316
Categories (3, object): [a, b, c]
310317
311318
"""
319+
312320
values = _ensure_arraylike(values)
313321

314322
# categorical is a fast-path
@@ -326,6 +334,13 @@ def unique(values):
326334
uniques = table.unique(values)
327335
uniques = _reconstruct_data(uniques, dtype, original)
328336

337+
if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
338+
# we are special casing datetime64tz_dtype
339+
# to return an object array of tz-aware Timestamps
340+
341+
# TODO: it must return DatetimeArray with tz in pandas 2.0
342+
uniques = uniques.asobject.values
343+
329344
return uniques
330345

331346

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,7 @@ def unique(self):
873873
else:
874874
from pandas.core.algorithms import unique1d
875875
result = unique1d(values)
876+
876877
return result
877878

878879
def nunique(self, dropna=True):

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1204,10 +1204,14 @@ def mode(self):
12041204
@Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
12051205
def unique(self):
12061206
result = super(Series, self).unique()
1207+
12071208
if is_datetime64tz_dtype(self.dtype):
1208-
# to return array of Timestamp with tz
1209-
# ToDo: it must return DatetimeArray with tz in pandas 2.0
1210-
return result.asobject.values
1209+
# we are special casing datetime64tz_dtype
1210+
# to return an object array of tz-aware Timestamps
1211+
1212+
# TODO: it must return DatetimeArray with tz in pandas 2.0
1213+
result = result.asobject.values
1214+
12111215
return result
12121216

12131217
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)

pandas/tests/test_algos.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ def test_categorical(self):
401401

402402
def test_datetime64tz_aware(self):
403403
# GH 15939
404+
404405
result = Series(
405406
pd.Index([Timestamp('20160101', tz='US/Eastern'),
406407
Timestamp('20160101', tz='US/Eastern')])).unique()
@@ -417,9 +418,9 @@ def test_datetime64tz_aware(self):
417418
result = pd.unique(
418419
Series(pd.Index([Timestamp('20160101', tz='US/Eastern'),
419420
Timestamp('20160101', tz='US/Eastern')])))
420-
expected = DatetimeIndex(['2016-01-01 00:00:00'],
421-
dtype='datetime64[ns, US/Eastern]', freq=None)
422-
tm.assert_index_equal(result, expected)
421+
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
422+
tz='US/Eastern')], dtype=object)
423+
tm.assert_numpy_array_equal(result, expected)
423424

424425
result = pd.unique(pd.Index([Timestamp('20160101', tz='US/Eastern'),
425426
Timestamp('20160101', tz='US/Eastern')]))

0 commit comments

Comments
 (0)