diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py
index 6cf9003500b61..afe651d22c6a7 100644
--- a/pandas/tests/indexes/multi/conftest.py
+++ b/pandas/tests/indexes/multi/conftest.py
@@ -15,13 +15,25 @@ def idx():
     major_labels = np.array([0, 0, 1, 2, 3, 3])
     minor_labels = np.array([0, 1, 0, 1, 0, 1])
     index_names = ['first', 'second']
-    index = MultiIndex(
-        levels=[major_axis, minor_axis],
-        labels=[major_labels, minor_labels],
-        names=index_names,
-        verify_integrity=False
-    )
-    return index
+    mi = MultiIndex(levels=[major_axis, minor_axis],
+                    labels=[major_labels, minor_labels],
+                    names=index_names, verify_integrity=False)
+    return mi
+
+
+@pytest.fixture
+def idx_dup():
+    # compare tests/indexes/multi/conftest.py
+    major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+    minor_axis = Index(['one', 'two'])
+
+    major_labels = np.array([0, 0, 1, 0, 1, 1])
+    minor_labels = np.array([0, 1, 0, 1, 0, 1])
+    index_names = ['first', 'second']
+    mi = MultiIndex(levels=[major_axis, minor_axis],
+                    labels=[major_labels, minor_labels],
+                    names=index_names, verify_integrity=False)
+    return mi
 
 
 @pytest.fixture
diff --git a/pandas/tests/indexes/multi/test_unique_and_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
similarity index 58%
rename from pandas/tests/indexes/multi/test_unique_and_duplicates.py
rename to pandas/tests/indexes/multi/test_duplicates.py
index c1000e5b6e0f6..1cdf0ca6e013e 100644
--- a/pandas/tests/indexes/multi/test_unique_and_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -2,56 +2,54 @@
 import warnings
 from itertools import product
 
+import pytest
 import numpy as np
-import pandas as pd
-import pandas.util.testing as tm
-import pytest
-from pandas import MultiIndex
+
 from pandas.compat import range, u
+from pandas import MultiIndex, DatetimeIndex
+from pandas._libs import hashtable
+import pandas.util.testing as tm
 
 
 @pytest.mark.parametrize('names', [None, ['first', 'second']])
 def test_unique(names):
-    mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]],
-                                   names=names)
+    mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)
 
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
+    exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
     tm.assert_index_equal(res, exp)
 
-    mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')],
-                                   names=names)
+    mi = MultiIndex.from_arrays([list('aaaa'), list('abab')],
+                                names=names)
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')],
-                                    names=mi.names)
+    exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names)
     tm.assert_index_equal(res, exp)
 
-    mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')],
-                                   names=names)
+    mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names)
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
+    exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
     tm.assert_index_equal(res, exp)
 
     # GH #20568 - empty MI
-    mi = pd.MultiIndex.from_arrays([[], []], names=names)
+    mi = MultiIndex.from_arrays([[], []], names=names)
     res = mi.unique()
     tm.assert_index_equal(mi, res)
 
 
 def test_unique_datetimelike():
-    idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
-                             '2015-01-01', 'NaT', 'NaT'])
-    idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
-                             '2015-01-02', 'NaT', '2015-01-01'],
-                            tz='Asia/Tokyo')
-    result = pd.MultiIndex.from_arrays([idx1, idx2]).unique()
-
-    eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
-    eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02',
-                              'NaT', '2015-01-01'],
-                             tz='Asia/Tokyo')
-    exp = pd.MultiIndex.from_arrays([eidx1, eidx2])
+    idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
+                          '2015-01-01', 'NaT', 'NaT'])
+    idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
+                          '2015-01-02', 'NaT', '2015-01-01'],
+                         tz='Asia/Tokyo')
+    result = MultiIndex.from_arrays([idx1, idx2]).unique()
+
+    eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
+    eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
+                           'NaT', '2015-01-01'],
+                          tz='Asia/Tokyo')
+    exp = MultiIndex.from_arrays([eidx1, eidx2])
     tm.assert_index_equal(result, exp)
 
 
@@ -63,41 +61,51 @@ def test_unique_level(idx, level):
     tm.assert_index_equal(result, expected)
 
     # With already unique level
-    mi = pd.MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
-                                   names=['first', 'second'])
+    mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
+                                names=['first', 'second'])
     result = mi.unique(level=level)
     expected = mi.get_level_values(level)
     tm.assert_index_equal(result, expected)
 
     # With empty MI
-    mi = pd.MultiIndex.from_arrays([[], []], names=['first', 'second'])
+    mi = MultiIndex.from_arrays([[], []], names=['first', 'second'])
     result = mi.unique(level=level)
     expected = mi.get_level_values(level)
 
 
+@pytest.mark.parametrize('dropna', [True, False])
+def test_get_unique_index(idx, dropna):
+    mi = idx[[0, 1, 0, 1, 1, 0, 0]]
+    expected = mi._shallow_copy(mi[[0, 1]])
+
+    result = mi._get_unique_index(dropna=dropna)
+    assert result.unique
+    tm.assert_index_equal(result, expected)
+
+
 def test_duplicate_multiindex_labels():
     # GH 17464
     # Make sure that a MultiIndex with duplicate levels throws a ValueError
     with pytest.raises(ValueError):
-        ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
+        mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
 
     # And that using set_levels with duplicate levels fails
-    ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
-                                  [1, 2, 1, 2, 3]])
+    mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
+                                 [1, 2, 1, 2, 3]])
     with pytest.raises(ValueError):
-        ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
-                       inplace=True)
+        mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
+                      inplace=True)
 
 
 @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], [1, 'a', 1]])
 def test_duplicate_level_names(names):
     # GH18872, GH19029
-    mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
+    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
     assert mi.names == names
 
     # With .rename()
-    mi = pd.MultiIndex.from_product([[0, 1]] * 3)
+    mi = MultiIndex.from_product([[0, 1]] * 3)
     mi = mi.rename(names)
     assert mi.names == names
 
@@ -109,27 +117,34 @@ def test_duplicate_level_names(names):
 
 def test_duplicate_meta_data():
     # GH 10115
-    index = MultiIndex(
+    mi = MultiIndex(
         levels=[[0, 1], [0, 1, 2]],
         labels=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]])
 
-    for idx in [index,
-                index.set_names([None, None]),
-                index.set_names([None, 'Num']),
-                index.set_names(['Upper', 'Num']), ]:
+    for idx in [mi,
+                mi.set_names([None, None]),
+                mi.set_names([None, 'Num']),
+                mi.set_names(['Upper', 'Num']), ]:
         assert idx.has_duplicates
         assert idx.drop_duplicates().names == idx.names
 
 
-def test_duplicates(idx):
+def test_has_duplicates(idx, idx_dup):
+    # see fixtures
+    assert idx.is_unique
     assert not idx.has_duplicates
-    assert idx.append(idx).has_duplicates
+    assert not idx_dup.is_unique
+    assert idx_dup.has_duplicates
 
-    index = MultiIndex(levels=[[0, 1], [0, 1, 2]], labels=[
-        [0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]])
-    assert index.has_duplicates
+    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
+                    labels=[[0, 0, 0, 0, 1, 1, 1],
+                            [0, 1, 2, 0, 0, 1, 2]])
+    assert not mi.is_unique
+    assert mi.has_duplicates
+
+
+def test_has_duplicates_from_tuples():
     # GH 9075
     t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
         (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
@@ -150,9 +165,11 @@ def test_duplicates(idx):
          (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
          (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]
 
-    index = pd.MultiIndex.from_tuples(t)
-    assert not index.has_duplicates
+    mi = MultiIndex.from_tuples(t)
+    assert not mi.has_duplicates
 
+
+def test_has_duplicates_overflow():
     # handle int64 overflow if possible
     def check(nlevels, with_nulls):
         labels = np.tile(np.arange(500), 2)
@@ -171,20 +188,20 @@ def check(nlevels, with_nulls):
         levels = [level] * nlevels + [[0, 1]]
 
         # no dups
-        index = MultiIndex(levels=levels, labels=labels)
-        assert not index.has_duplicates
+        mi = MultiIndex(levels=levels, labels=labels)
+        assert not mi.has_duplicates
 
         # with a dup
         if with_nulls:
             def f(a):
                 return np.insert(a, 1000, a[0])
             labels = list(map(f, labels))
-            index = MultiIndex(levels=levels, labels=labels)
+            mi = MultiIndex(levels=levels, labels=labels)
         else:
-            values = index.values.tolist()
-            index = MultiIndex.from_tuples(values + [values[0]])
+            values = mi.values.tolist()
+            mi = MultiIndex.from_tuples(values + [values[0]])
 
-        assert index.has_duplicates
+        assert mi.has_duplicates
 
     # no overflow
     check(4, False)
@@ -194,17 +211,31 @@ def f(a):
     check(8, False)
     check(8, True)
 
+
+@pytest.mark.parametrize('keep, expected', [
+    ('first', np.array([False, False, False, True, True, False])),
+    ('last', np.array([False, True, True, False, False, False])),
+    (False, np.array([False, True, True, True, True, False]))
+])
+def test_duplicated(idx_dup, keep, expected):
+    result = idx_dup.duplicated(keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last', False])
+def test_duplicated_large(keep):
     # GH 9125
     n, k = 200, 5000
     levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
     labels = [np.random.choice(n, k * n) for lev in levels]
     mi = MultiIndex(levels=levels, labels=labels)
 
-    for keep in ['first', 'last', False]:
-        left = mi.duplicated(keep=keep)
-        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
-        tm.assert_numpy_array_equal(left, right)
+    result = mi.duplicated(keep=keep)
+    expected = hashtable.duplicated_object(mi.values, keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
 
+
+def test_get_duplicates():
     # GH5873
     for a in [101, 102]:
         mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
@@ -212,11 +243,10 @@ def f(a):
 
         with warnings.catch_warnings(record=True):
             # Deprecated - see GH20239
-            assert mi.get_duplicates().equals(MultiIndex.from_arrays(
-                [[], []]))
+            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))
 
-        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
-            2, dtype='bool'))
+        tm.assert_numpy_array_equal(mi.duplicated(),
+                                    np.zeros(2, dtype='bool'))
 
     for n in range(1, 6):  # 1st level shape
         for m in range(1, 5):  # 2nd level shape
@@ -232,28 +262,5 @@ def f(a):
                 assert mi.get_duplicates().equals(MultiIndex.from_arrays(
                     [[], []]))
 
-            tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
-                len(mi), dtype='bool'))
-
-
-def test_get_unique_index(idx):
-    idx = idx[[0, 1, 0, 1, 1, 0, 0]]
-    expected = idx._shallow_copy(idx[[0, 1]])
-
-    for dropna in [False, True]:
-        result = idx._get_unique_index(dropna=dropna)
-        assert result.unique
-        tm.assert_index_equal(result, expected)
-
-
-def test_unique_na():
-    idx = pd.Index([2, np.nan, 2, 1], name='my_index')
-    expected = pd.Index([2, np.nan, 1], name='my_index')
-    result = idx.unique()
-    tm.assert_index_equal(result, expected)
-
-
-def test_duplicate_level_names_access_raises(idx):
-    idx.names = ['foo', 'foo']
-    tm.assert_raises_regex(ValueError, 'name foo occurs multiple times',
-                           idx._get_level_number, 'foo')
+            tm.assert_numpy_array_equal(mi.duplicated(),
+                                        np.zeros(len(mi), dtype='bool'))
diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py
index a9fbb55679173..68e8bb0cf58f2 100644
--- a/pandas/tests/indexes/multi/test_names.py
+++ b/pandas/tests/indexes/multi/test_names.py
@@ -115,3 +115,10 @@ def test_names(idx, index_names):
     ind_names = list(index.names)
     level_names = [level.name for level in index.levels]
     assert ind_names == level_names
+
+
+def test_duplicate_level_names_access_raises(idx):
+    # GH19029
+    idx.names = ['foo', 'foo']
+    tm.assert_raises_regex(ValueError, 'name foo occurs multiple times',
+                           idx._get_level_number, 'foo')