Skip to content

Commit c3c60f0

Browse files
jrebackjorisvandenbossche
authored andcommitted
DOC/API/TST: add pd.unique doc-string & consistent return value for Categorical/tz-aware datetime (#15939)
closes #9346
1 parent 56c2019 commit c3c60f0

File tree

6 files changed

+371
-66
lines changed

6 files changed

+371
-66
lines changed

doc/source/whatsnew/v0.20.0.txt

+71
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,76 @@ result. On the other hand, this might have backward incompatibilities: e.g.
593593
compared to numpy arrays, ``Index`` objects are not mutable. To get the original
594594
ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``.
595595

596+
.. _whatsnew_0200.api_breaking.unique:
597+
598+
pd.unique will now be consistent with extension types
599+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
600+
601+
In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware
602+
datatypes would yield different return types. These are now made consistent. (:issue:`15903`)
603+
604+
- Datetime tz-aware
605+
606+
Previous Behavior:
607+
608+
.. code-block:: ipython
609+
610+
# Series
611+
In [5]: pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
612+
pd.Timestamp('20160101', tz='US/Eastern')]).unique()
613+
Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object)
614+
615+
In [6]: pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
616+
pd.Timestamp('20160101', tz='US/Eastern')]))
617+
Out[6]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
618+
619+
# Index
620+
In [7]: pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
621+
pd.Timestamp('20160101', tz='US/Eastern')]).unique()
622+
Out[7]: DatetimeIndex(['2016-01-01 00:00:00-05:00'], dtype='datetime64[ns, US/Eastern]', freq=None)
623+
624+
In [8]: pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
624+
pd.Timestamp('20160101', tz='US/Eastern')]))
626+
Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
627+
628+
New Behavior:
629+
630+
.. ipython:: python
631+
632+
# Series, returns an array of Timestamp tz-aware
633+
pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
634+
pd.Timestamp('20160101', tz='US/Eastern')]).unique()
635+
pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
636+
pd.Timestamp('20160101', tz='US/Eastern')]))
637+
638+
# Index, returns a DatetimeIndex
639+
pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
640+
pd.Timestamp('20160101', tz='US/Eastern')]).unique()
641+
pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
642+
pd.Timestamp('20160101', tz='US/Eastern')]))
643+
644+
- Categoricals
645+
646+
Previous Behavior:
647+
648+
.. code-block:: ipython
649+
650+
In [1]: pd.Series(pd.Categorical(list('baabc'))).unique()
651+
Out[1]:
652+
[b, a, c]
653+
Categories (3, object): [b, a, c]
654+
655+
In [2]: pd.unique(pd.Series(pd.Categorical(list('baabc'))))
656+
Out[2]: array(['b', 'a', 'c'], dtype=object)
657+
658+
New Behavior:
659+
660+
.. ipython:: python
661+
662+
# returns a Categorical
663+
pd.Series(pd.Categorical(list('baabc'))).unique()
664+
pd.unique(pd.Series(pd.Categorical(list('baabc'))))
665+
596666
.. _whatsnew_0200.api_breaking.s3:
597667

598668
S3 File Handling
@@ -1148,6 +1218,7 @@ Conversion
11481218
- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`)
11491219
- Bug in ``DataFrame.fillna()`` with tz-aware datetimes (:issue:`15855`)
11501220
- Bug in ``is_string_dtype``, ``is_timedelta64_ns_dtype``, and ``is_string_like_dtype`` in which an error was raised when ``None`` was passed in (:issue:`15941`)
1221+
- Bug in the return type of ``pd.unique`` on a ``Categorical``, which was returning an ndarray and not a ``Categorical`` (:issue:`15903`)
11511222

11521223
Indexing
11531224
^^^^^^^^

pandas/core/algorithms.py

+85-4
Original file line numberDiff line numberDiff line change
@@ -267,22 +267,103 @@ def match(to_match, values, na_sentinel=-1):
267267
return result
268268

269269

270-
def unique1d(values):
270+
def unique(values):
271271
"""
272-
Hash table-based unique
272+
Hash table-based unique. Uniques are returned in order
273+
of appearance. This does NOT sort.
274+
275+
Significantly faster than numpy.unique. Includes NA values.
276+
277+
Parameters
278+
----------
279+
values : 1d array-like
280+
281+
Returns
282+
-------
283+
unique values.
284+
- If the input is an Index, the return is an Index
285+
- If the input is a Categorical dtype, the return is a Categorical
286+
- If the input is a Series/ndarray, the return will be an ndarray
287+
288+
Examples
289+
--------
290+
>>> pd.unique(pd.Series([2, 1, 3, 3]))
291+
array([2, 1, 3])
292+
293+
>>> pd.unique(pd.Series([2] + [1] * 5))
294+
array([2, 1])
295+
296+
>>> pd.unique(pd.Series([pd.Timestamp('20160101'),
297+
... pd.Timestamp('20160101')]))
298+
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
299+
300+
>>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
301+
... pd.Timestamp('20160101', tz='US/Eastern')]))
302+
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
303+
dtype=object)
304+
305+
>>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
306+
... pd.Timestamp('20160101', tz='US/Eastern')]))
307+
DatetimeIndex(['2016-01-01 00:00:00-05:00'],
308+
dtype='datetime64[ns, US/Eastern]', freq=None)
309+
310+
>>> pd.unique(list('baabc'))
311+
array(['b', 'a', 'c'], dtype=object)
312+
313+
An unordered Categorical will return categories in the
314+
order of appearance.
315+
316+
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
317+
[b, a, c]
318+
Categories (3, object): [b, a, c]
319+
320+
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
321+
... categories=list('abc'))))
322+
[b, a, c]
323+
Categories (3, object): [b, a, c]
324+
325+
An ordered Categorical preserves the category ordering.
326+
327+
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
328+
... categories=list('abc'),
329+
... ordered=True)))
330+
[b, a, c]
331+
Categories (3, object): [a < b < c]
332+
333+
See Also
334+
--------
335+
pandas.Index.unique
336+
pandas.Series.unique
337+
273338
"""
339+
274340
values = _ensure_arraylike(values)
341+
342+
# categorical is a fast-path
343+
# this will coerce Categorical, CategoricalIndex,
344+
# and category dtypes Series to same return of Category
345+
if is_categorical_dtype(values):
346+
values = getattr(values, '.values', values)
347+
return values.unique()
348+
275349
original = values
276350
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
277351

278352
table = htable(len(values))
279353
uniques = table.unique(values)
280354
uniques = _reconstruct_data(uniques, dtype, original)
281355

356+
if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
357+
# we are special casing datetime64tz_dtype
358+
# to return an object array of tz-aware Timestamps
359+
360+
# TODO: it must return DatetimeArray with tz in pandas 2.0
361+
uniques = uniques.asobject.values
362+
282363
return uniques
283364

284365

285-
unique = unique1d
366+
unique1d = unique
286367

287368

288369
def isin(comps, values):
@@ -651,7 +732,7 @@ def mode(values):
651732
if is_categorical_dtype(values):
652733

653734
if isinstance(values, Series):
654-
return Series(values.values.mode())
735+
return Series(values.values.mode(), name=values.name)
655736
return values.mode()
656737

657738
values, dtype, ndtype = _ensure_data(values)

pandas/core/base.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -855,13 +855,24 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
855855

856856
_shared_docs['unique'] = (
857857
"""
858-
Return %(unique)s of unique values in the object.
859-
Significantly faster than numpy.unique. Includes NA values.
860-
The order of the original is preserved.
858+
Return unique values in the object. Uniques are returned in order
859+
of appearance, this does NOT sort. Hash table-based unique.
860+
861+
Parameters
862+
----------
863+
values : 1d array-like
861864
862865
Returns
863866
-------
864-
uniques : %(unique)s
867+
unique values.
868+
- If the input is an Index, the return is an Index
869+
- If the input is a Categorical dtype, the return is a Categorical
870+
- If the input is a Series/ndarray, the return will be an ndarray
871+
872+
See Also
873+
--------
874+
pandas.unique
875+
pandas.Categorical.unique
865876
""")
866877

867878
@Appender(_shared_docs['unique'] % _indexops_doc_kwargs)
@@ -873,6 +884,7 @@ def unique(self):
873884
else:
874885
from pandas.core.algorithms import unique1d
875886
result = unique1d(values)
887+
876888
return result
877889

878890
def nunique(self, dropna=True):

pandas/core/categorical.py

+27
Original file line numberDiff line numberDiff line change
@@ -1895,6 +1895,33 @@ def unique(self):
18951895
Returns
18961896
-------
18971897
unique values : ``Categorical``
1898+
1899+
Examples
1900+
--------
1901+
An unordered Categorical will return categories in the
1902+
order of appearance.
1903+
1904+
>>> pd.Categorical(list('baabc')).unique()
1905+
[b, a, c]
1906+
Categories (3, object): [b, a, c]
1907+
1908+
>>> pd.Categorical(list('baabc'), categories=list('abc')).unique()
1909+
[b, a, c]
1910+
Categories (3, object): [b, a, c]
1911+
1912+
An ordered Categorical preserves the category ordering.
1913+
1914+
>>> pd.Categorical(list('baabc'),
1915+
... categories=list('abc'),
1916+
... ordered=True).unique()
1917+
[b, a, c]
1918+
Categories (3, object): [a < b < c]
1919+
1920+
See Also
1921+
--------
1922+
pandas.unique
1923+
pandas.CategoricalIndex.unique
1924+
18981925
"""
18991926

19001927
# unlike np.unique, unique1d does not sort

pandas/core/series.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1204,10 +1204,14 @@ def mode(self):
12041204
@Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
12051205
def unique(self):
12061206
result = super(Series, self).unique()
1207+
12071208
if is_datetime64tz_dtype(self.dtype):
1208-
# to return array of Timestamp with tz
1209-
# ToDo: it must return DatetimeArray with tz in pandas 2.0
1210-
return result.asobject.values
1209+
# we are special casing datetime64tz_dtype
1210+
# to return an object array of tz-aware Timestamps
1211+
1212+
# TODO: it must return DatetimeArray with tz in pandas 2.0
1213+
result = result.asobject.values
1214+
12111215
return result
12121216

12131217
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)

0 commit comments

Comments
 (0)