Skip to content

Commit 5a20ea2

Browse files
sinhrks authored and jorisvandenbossche committed
API: change unique to return Index (pandas-dev#13979)
1 parent ca2b104 commit 5a20ea2

File tree

11 files changed

+209
-88
lines changed

11 files changed

+209
-88
lines changed

doc/source/whatsnew/v0.19.0.txt

+31-2
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ API changes
457457
- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
458458
- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
459459
- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
460-
460+
- ``Series.unique()`` with datetime and timezone now returns an array of ``Timestamp`` with timezone (:issue:`13565`)
461461

462462

463463

@@ -904,6 +904,35 @@ New Behavior:
904904
idx1.difference(idx2)
905905
idx1.symmetric_difference(idx2)
906906

907+
.. _whatsnew_0190.api.unique_index:
908+
909+
``Index.unique`` consistently returns ``Index``
910+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
911+
912+
``Index.unique()`` now returns unique values as an
913+
``Index`` of the appropriate ``dtype``. (:issue:`13395`)
914+
915+
Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex``,
916+
``TimedeltaIndex`` and ``PeriodIndex`` returned ``Index`` to keep metadata like timezone.
917+
918+
Previous Behavior:
919+
920+
.. code-block:: ipython
921+
922+
In [1]: pd.Index([1, 2, 3]).unique()
923+
Out[1]: array([1, 2, 3])
924+
In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique()
925+
Out[2]: DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00',
926+
'2011-01-03 00:00:00+09:00'],
927+
dtype='datetime64[ns, Asia/Tokyo]', freq=None)
928+
929+
New Behavior:
930+
931+
.. ipython:: python
932+
933+
pd.Index([1, 2, 3]).unique()
934+
pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique()
935+
907936
.. _whatsnew_0190.api.autogenerated_chunksize_index:
908937

909938
``read_csv`` will progressively enumerate chunks
@@ -1181,6 +1210,7 @@ Bug Fixes
11811210
- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
11821211
- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
11831212

1213+
11841214
- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
11851215
- Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
11861216
- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
@@ -1248,7 +1278,6 @@ Bug Fixes
12481278

12491279
- Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`)
12501280
- Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`)
1251-
12521281
- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
12531282
- Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`)
12541283

pandas/core/base.py

+18-13
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77

88
from pandas.types.missing import isnull
99
from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass
10-
from pandas.types.common import (is_object_dtype,
11-
is_list_like, is_scalar)
10+
from pandas.types.common import is_object_dtype, is_list_like, is_scalar
1211

1312
from pandas.core import common as com
1413
import pandas.core.nanops as nanops
@@ -21,7 +20,7 @@
2120

2221
_shared_docs = dict()
2322
_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='',
24-
duplicated='IndexOpsMixin')
23+
unique='IndexOpsMixin', duplicated='IndexOpsMixin')
2524

2625

2726
class StringMixin(object):
@@ -952,21 +951,27 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
952951
normalize=normalize, bins=bins, dropna=dropna)
953952
return result
954953

955-
def unique(self):
954+
_shared_docs['unique'] = (
956955
"""
957-
Return array of unique values in the object. Significantly faster than
958-
numpy.unique. Includes NA values.
956+
Return %(unique)s of unique values in the object.
957+
Significantly faster than numpy.unique. Includes NA values.
958+
The order of the original is preserved.
959959
960960
Returns
961961
-------
962-
uniques : ndarray
963-
"""
964-
from pandas.core.nanops import unique1d
965-
values = self.values
966-
if hasattr(values, 'unique'):
967-
return values.unique()
962+
uniques : %(unique)s
963+
""")
968964

969-
return unique1d(values)
965+
@Appender(_shared_docs['unique'] % _indexops_doc_kwargs)
966+
def unique(self):
967+
values = self._values
968+
969+
if hasattr(values, 'unique'):
970+
result = values.unique()
971+
else:
972+
from pandas.core.nanops import unique1d
973+
result = unique1d(values)
974+
return result
970975

971976
def nunique(self, dropna=True):
972977
"""

pandas/core/series.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
is_float_dtype,
1919
is_extension_type, is_datetimetz,
2020
is_datetimelike,
21+
is_datetime64tz_dtype,
2122
is_timedelta64_dtype,
2223
is_list_like,
2324
is_hashable,
@@ -77,7 +78,7 @@
7778
axes='index', klass='Series', axes_single_arg="{0, 'index'}",
7879
inplace="""inplace : boolean, default False
7980
If True, performs operation inplace and returns None.""",
80-
duplicated='Series',
81+
unique='np.ndarray', duplicated='Series',
8182
optional_by='')
8283

8384

@@ -1231,6 +1232,15 @@ def mode(self):
12311232
# TODO: Add option for bins like value_counts()
12321233
return algos.mode(self)
12331234

1235+
@Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
1236+
def unique(self):
1237+
result = super(Series, self).unique()
1238+
if is_datetime64tz_dtype(self.dtype):
1239+
# to return array of Timestamp with tz
1240+
# ToDo: it must return DatetimeArray with tz in pandas 2.0
1241+
return result.asobject.values
1242+
return result
1243+
12341244
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
12351245
False: 'first'})
12361246
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)

pandas/indexes/base.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@
6060

6161
_unsortable_types = frozenset(('mixed', 'mixed-integer'))
6262

63-
_index_doc_kwargs = dict(klass='Index', inplace='', duplicated='np.array')
63+
_index_doc_kwargs = dict(klass='Index', inplace='',
64+
unique='Index', duplicated='np.ndarray')
6465
_index_shared_docs = dict()
6566

6667

@@ -3217,6 +3218,11 @@ def drop(self, labels, errors='raise'):
32173218
indexer = indexer[~mask]
32183219
return self.delete(indexer)
32193220

3221+
@Appender(base._shared_docs['unique'] % _index_doc_kwargs)
3222+
def unique(self):
3223+
result = super(Index, self).unique()
3224+
return self._shallow_copy(result)
3225+
32203226
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
32213227
False: 'first'})
32223228
@Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)

pandas/indexes/category.py

+8
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,14 @@ def _engine(self):
283283
def is_unique(self):
284284
return not self.duplicated().any()
285285

286+
@Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs)
287+
def unique(self):
288+
result = base.IndexOpsMixin.unique(self)
289+
# CategoricalIndex._shallow_copy keeps the original categories
290+
# and ordered if not otherwise specified
291+
return self._shallow_copy(result, categories=result.categories,
292+
ordered=result.ordered)
293+
286294
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
287295
False: 'first'})
288296
@Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs)

pandas/tests/indexes/test_category.py

+1
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ def test_duplicates(self):
395395

396396
expected = CategoricalIndex([0], name='foo')
397397
self.assert_index_equal(idx.drop_duplicates(), expected)
398+
self.assert_index_equal(idx.unique(), expected)
398399

399400
def test_get_indexer(self):
400401

pandas/tests/indexes/test_multi.py

+32
Original file line numberDiff line numberDiff line change
@@ -1927,6 +1927,38 @@ def test_get_unique_index(self):
19271927
self.assertTrue(result.unique)
19281928
self.assert_index_equal(result, expected)
19291929

1930+
def test_unique(self):
1931+
mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]])
1932+
1933+
res = mi.unique()
1934+
exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]])
1935+
tm.assert_index_equal(res, exp)
1936+
1937+
mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')])
1938+
res = mi.unique()
1939+
exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')])
1940+
tm.assert_index_equal(res, exp)
1941+
1942+
mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')])
1943+
res = mi.unique()
1944+
exp = pd.MultiIndex.from_arrays([['a'], ['a']])
1945+
tm.assert_index_equal(res, exp)
1946+
1947+
def test_unique_datetimelike(self):
1948+
idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
1949+
'2015-01-01', 'NaT', 'NaT'])
1950+
idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
1951+
'2015-01-02', 'NaT', '2015-01-01'],
1952+
tz='Asia/Tokyo')
1953+
result = pd.MultiIndex.from_arrays([idx1, idx2]).unique()
1954+
1955+
eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
1956+
eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02',
1957+
'NaT', '2015-01-01'],
1958+
tz='Asia/Tokyo')
1959+
exp = pd.MultiIndex.from_arrays([eidx1, eidx2])
1960+
tm.assert_index_equal(result, exp)
1961+
19301962
def test_tolist(self):
19311963
result = self.index.tolist()
19321964
exp = list(self.index.values)

0 commit comments

Comments
 (0)