Skip to content

Commit feb65ed

Browse files
committed
API: add "level=" argument to MultiIndex.unique()
closes pandas-dev#17896
1 parent b00e62c commit feb65ed

File tree

6 files changed

+94
-17
lines changed

6 files changed

+94
-17
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Other Enhancements
2424

2525
- Better support for :func:`DataFrame.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`)
2626
- :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`)
27+
- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`)
2728
- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in output (:issue:`14194`)
2829
- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
2930

pandas/core/indexes/base.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -3757,8 +3757,32 @@ def drop(self, labels, errors='raise'):
37573757
indexer = indexer[~mask]
37583758
return self.delete(indexer)
37593759

3760-
@Appender(base._shared_docs['unique'] % _index_doc_kwargs)
3761-
def unique(self):
3760+
_index_shared_docs['index_unique'] = (
3761+
"""
3762+
Return unique values in the index. Uniques are returned in order
3763+
of appearance, this does NOT sort.
3764+
3765+
Parameters
3766+
----------
3767+
level : int or str, optional, default None
3768+
Only return values from specified level (for MultiIndex)
3769+
3770+
.. versionadded:: 0.22.0
3771+
3772+
Returns
3773+
-------
3774+
Index without duplicates
3775+
3776+
See Also
3777+
--------
3778+
unique
3779+
Series.unique
3780+
""")
3781+
3782+
@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
3783+
def unique(self, level=None):
3784+
if level is not None:
3785+
self._validate_index_level(level)
37623786
result = super(Index, self).unique()
37633787
return self._shallow_copy(result)
37643788

pandas/core/indexes/category.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -378,8 +378,10 @@ def is_monotonic_increasing(self):
378378
def is_monotonic_decreasing(self):
379379
return Index(self.codes).is_monotonic_decreasing
380380

381-
@Appender(base._shared_docs['unique'] % _index_doc_kwargs)
382-
def unique(self):
381+
@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
382+
def unique(self, level=None):
383+
if level is not None:
384+
self._validate_index_level(level)
383385
result = base.IndexOpsMixin.unique(self)
384386
# CategoricalIndex._shallow_copy keeps original categories
385387
# and ordered if not otherwise specified

pandas/core/indexes/multi.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,7 @@ def _try_mi(k):
908908

909909
raise InvalidIndexError(key)
910910

911-
def _get_level_values(self, level):
911+
def _get_level_values(self, level, unique=False):
912912
"""
913913
Return vector of label values for requested level,
914914
equal to the length of the index
@@ -918,17 +918,21 @@ def _get_level_values(self, level):
918918
Parameters
919919
----------
920920
level : int level
921+
unique : bool, default False
922+
if True, drop duplicated values
921923
922924
Returns
923925
-------
924926
values : ndarray
925927
"""
926928

927-
unique = self.levels[level]
929+
values = self.levels[level]
928930
labels = self.labels[level]
929-
filled = algos.take_1d(unique._values, labels,
930-
fill_value=unique._na_value)
931-
values = unique._shallow_copy(filled)
931+
if unique:
932+
labels = algos.unique(labels)
933+
filled = algos.take_1d(values._values, labels,
934+
fill_value=values._na_value)
935+
values = values._shallow_copy(filled)
932936
return values
933937

934938
def get_level_values(self, level):
@@ -967,6 +971,15 @@ def get_level_values(self, level):
967971
values = self._get_level_values(level)
968972
return values
969973

974+
@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
975+
def unique(self, level=None):
976+
977+
if level is None:
978+
return super(MultiIndex, self).unique()
979+
else:
980+
level = self._get_level_number(level)
981+
return self._get_level_values(level=level, unique=True)
982+
970983
def format(self, space=2, sparsify=None, adjoin=True, names=False,
971984
na_rep=None, formatter=None):
972985
if len(self) == 0:

pandas/tests/indexes/common.py

+21
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,27 @@ def test_duplicates(self, indices):
329329
assert not idx.is_unique
330330
assert idx.has_duplicates
331331

332+
def test_unique(self, indices):
333+
# don't test a MultiIndex here (as it's tested separately)
334+
# don't test a CategoricalIndex because categories change (GH 18291)
335+
if isinstance(indices, (MultiIndex, CategoricalIndex)):
336+
return
337+
338+
# GH 17896
339+
expected = indices.drop_duplicates()
340+
for level in 0, indices.name, None:
341+
result = indices.unique(level=level)
342+
tm.assert_index_equal(result, expected)
343+
344+
for level in 3, 'wrong':
345+
pytest.raises((IndexError, KeyError), indices.unique, level=level)
346+
347+
def test_unique_na(self):
348+
idx = pd.Index([2, np.nan, 2, 1], name='my_index')
349+
expected = pd.Index([2, np.nan, 1], name='my_index')
350+
result = idx.unique()
351+
tm.assert_index_equal(result, expected)
352+
332353
def test_get_unique_index(self, indices):
333354
# MultiIndex tested separately
334355
if not len(indices) or isinstance(indices, MultiIndex):

pandas/tests/indexes/test_multi.py

+24-8
Original file line numberDiff line numberDiff line change
@@ -963,19 +963,21 @@ def test_get_level_values(self):
963963
exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
964964
tm.assert_index_equal(index.get_level_values(1), exp)
965965

966-
def test_get_level_values_na(self):
966+
@pytest.mark.xfail(reason='GH 17924 (returns Int64Index with float data)')
967+
def test_get_level_values_int_with_na(self):
967968
arrays = [['a', 'b', 'b'], [1, np.nan, 2]]
968969
index = pd.MultiIndex.from_arrays(arrays)
969-
values = index.get_level_values(1)
970-
expected = np.array([1, np.nan, 2])
971-
tm.assert_numpy_array_equal(values.values.astype(float), expected)
970+
result = index.get_level_values(1)
971+
expected = Index([1, np.nan, 2])
972+
tm.assert_index_equal(result, expected)
972973

973974
arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]]
974975
index = pd.MultiIndex.from_arrays(arrays)
975-
values = index.get_level_values(1)
976-
expected = np.array([np.nan, np.nan, 2])
977-
tm.assert_numpy_array_equal(values.values.astype(float), expected)
976+
result = index.get_level_values(1)
977+
expected = Index([np.nan, np.nan, 2])
978+
tm.assert_index_equal(result, expected)
978979

980+
def test_get_level_values_na(self):
979981
arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
980982
index = pd.MultiIndex.from_arrays(arrays)
981983
result = index.get_level_values(0)
@@ -990,7 +992,7 @@ def test_get_level_values_na(self):
990992
index = pd.MultiIndex.from_arrays(arrays)
991993
values = index.get_level_values(1)
992994
expected = pd.DatetimeIndex([0, 1, pd.NaT])
993-
tm.assert_numpy_array_equal(values.values, expected.values)
995+
tm.assert_index_equal(values, expected)
994996

995997
arrays = [[], []]
996998
index = pd.MultiIndex.from_arrays(arrays)
@@ -2277,6 +2279,20 @@ def test_unique(self):
22772279
exp = pd.MultiIndex.from_arrays([['a'], ['a']])
22782280
tm.assert_index_equal(res, exp)
22792281

2282+
@pytest.mark.parametrize('level', [0, 'first', 1, 'second'])
2283+
def test_unique_level(self, level):
2284+
# GH #17896 - with level= argument
2285+
result = self.index.unique(level=level)
2286+
expected = self.index.get_level_values(level).unique()
2287+
tm.assert_index_equal(result, expected)
2288+
2289+
# With already unique level
2290+
mi = pd.MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
2291+
names=['first', 'second'])
2292+
result = mi.unique(level=level)
2293+
expected = mi.get_level_values(level)
2294+
tm.assert_index_equal(result, expected)
2295+
22802296
def test_unique_datetimelike(self):
22812297
idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
22822298
'2015-01-01', 'NaT', 'NaT'])

0 commit comments

Comments
 (0)