
Commit fe3f7c2

h-vetinari authored and aeltanawy committed
TST/CLN: clean up indexes/multi/test_unique_and_duplicates (pandas-dev#21900)
1 parent ea60bfd commit fe3f7c2

File tree

3 files changed: +121 -95 lines


pandas/tests/indexes/multi/conftest.py (+19 -7)
@@ -15,13 +15,25 @@ def idx():
     major_labels = np.array([0, 0, 1, 2, 3, 3])
     minor_labels = np.array([0, 1, 0, 1, 0, 1])
     index_names = ['first', 'second']
-    index = MultiIndex(
-        levels=[major_axis, minor_axis],
-        labels=[major_labels, minor_labels],
-        names=index_names,
-        verify_integrity=False
-    )
-    return index
+    mi = MultiIndex(levels=[major_axis, minor_axis],
+                    labels=[major_labels, minor_labels],
+                    names=index_names, verify_integrity=False)
+    return mi
+
+
+@pytest.fixture
+def idx_dup():
+    # compare tests/indexes/multi/conftest.py
+    major_axis = Index(['foo', 'bar', 'baz', 'qux'])
+    minor_axis = Index(['one', 'two'])
+
+    major_labels = np.array([0, 0, 1, 0, 1, 1])
+    minor_labels = np.array([0, 1, 0, 1, 0, 1])
+    index_names = ['first', 'second']
+    mi = MultiIndex(levels=[major_axis, minor_axis],
+                    labels=[major_labels, minor_labels],
+                    names=index_names, verify_integrity=False)
+    return mi
 
 
 @pytest.fixture
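For orientation (not part of the commit): a minimal sketch of what the new idx_dup fixture builds. With these labels the tuples ('foo', 'two') and ('bar', 'one') each occur twice, so the index is non-unique by construction; the sketch assumes the MultiIndex API of this pandas version, where the constructor still takes labels= (later renamed to codes=).

# Sketch of the idx_dup fixture's index (assumes the labels=, pre-'codes' API).
import numpy as np
from pandas import Index, MultiIndex

mi = MultiIndex(levels=[Index(['foo', 'bar', 'baz', 'qux']),
                        Index(['one', 'two'])],
                labels=[np.array([0, 0, 1, 0, 1, 1]),
                        np.array([0, 1, 0, 1, 0, 1])],
                names=['first', 'second'], verify_integrity=False)

# ('foo', 'two') and ('bar', 'one') each appear twice:
assert not mi.is_unique
assert mi.has_duplicates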

pandas/tests/indexes/multi/test_unique_and_duplicates.py renamed to pandas/tests/indexes/multi/test_duplicates.py (+95 -88)
@@ -2,56 +2,54 @@
 
 import warnings
 from itertools import product
+import pytest
 
 import numpy as np
-import pandas as pd
-import pandas.util.testing as tm
-import pytest
-from pandas import MultiIndex
+
 from pandas.compat import range, u
+from pandas import MultiIndex, DatetimeIndex
+from pandas._libs import hashtable
+import pandas.util.testing as tm
 
 
 @pytest.mark.parametrize('names', [None, ['first', 'second']])
 def test_unique(names):
-    mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]],
-                                   names=names)
+    mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)
 
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
+    exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
     tm.assert_index_equal(res, exp)
 
-    mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')],
-                                   names=names)
+    mi = MultiIndex.from_arrays([list('aaaa'), list('abab')],
+                                names=names)
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')],
-                                    names=mi.names)
+    exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names)
     tm.assert_index_equal(res, exp)
 
-    mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')],
-                                   names=names)
+    mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names)
     res = mi.unique()
-    exp = pd.MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
+    exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
     tm.assert_index_equal(res, exp)
 
     # GH #20568 - empty MI
-    mi = pd.MultiIndex.from_arrays([[], []], names=names)
+    mi = MultiIndex.from_arrays([[], []], names=names)
     res = mi.unique()
     tm.assert_index_equal(mi, res)
 
 
 def test_unique_datetimelike():
-    idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
-                             '2015-01-01', 'NaT', 'NaT'])
-    idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
-                             '2015-01-02', 'NaT', '2015-01-01'],
-                            tz='Asia/Tokyo')
-    result = pd.MultiIndex.from_arrays([idx1, idx2]).unique()
-
-    eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
-    eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02',
-                              'NaT', '2015-01-01'],
-                             tz='Asia/Tokyo')
-    exp = pd.MultiIndex.from_arrays([eidx1, eidx2])
+    idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
+                          '2015-01-01', 'NaT', 'NaT'])
+    idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
+                          '2015-01-02', 'NaT', '2015-01-01'],
+                         tz='Asia/Tokyo')
+    result = MultiIndex.from_arrays([idx1, idx2]).unique()
+
+    eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
+    eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
+                           'NaT', '2015-01-01'],
+                          tz='Asia/Tokyo')
+    exp = MultiIndex.from_arrays([eidx1, eidx2])
     tm.assert_index_equal(result, exp)
 
 
@@ -63,41 +61,51 @@ def test_unique_level(idx, level):
     tm.assert_index_equal(result, expected)
 
     # With already unique level
-    mi = pd.MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
-                                   names=['first', 'second'])
+    mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
+                                names=['first', 'second'])
     result = mi.unique(level=level)
     expected = mi.get_level_values(level)
     tm.assert_index_equal(result, expected)
 
     # With empty MI
-    mi = pd.MultiIndex.from_arrays([[], []], names=['first', 'second'])
+    mi = MultiIndex.from_arrays([[], []], names=['first', 'second'])
     result = mi.unique(level=level)
     expected = mi.get_level_values(level)
 
 
+@pytest.mark.parametrize('dropna', [True, False])
+def test_get_unique_index(idx, dropna):
+    mi = idx[[0, 1, 0, 1, 1, 0, 0]]
+    expected = mi._shallow_copy(mi[[0, 1]])
+
+    result = mi._get_unique_index(dropna=dropna)
+    assert result.unique
+    tm.assert_index_equal(result, expected)
+
+
 def test_duplicate_multiindex_labels():
     # GH 17464
     # Make sure that a MultiIndex with duplicate levels throws a ValueError
     with pytest.raises(ValueError):
-        ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
+        mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
 
     # And that using set_levels with duplicate levels fails
-    ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
-                                  [1, 2, 1, 2, 3]])
+    mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
+                                 [1, 2, 1, 2, 3]])
     with pytest.raises(ValueError):
-        ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
-                       inplace=True)
+        mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
+                      inplace=True)
 
 
 @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
                                    [1, 'a', 1]])
 def test_duplicate_level_names(names):
     # GH18872, GH19029
-    mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
+    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
     assert mi.names == names
 
     # With .rename()
-    mi = pd.MultiIndex.from_product([[0, 1]] * 3)
+    mi = MultiIndex.from_product([[0, 1]] * 3)
     mi = mi.rename(names)
     assert mi.names == names
 
@@ -109,27 +117,34 @@ def test_duplicate_level_names(names):
 
 def test_duplicate_meta_data():
     # GH 10115
-    index = MultiIndex(
+    mi = MultiIndex(
         levels=[[0, 1], [0, 1, 2]],
         labels=[[0, 0, 0, 0, 1, 1, 1],
                 [0, 1, 2, 0, 0, 1, 2]])
 
-    for idx in [index,
-                index.set_names([None, None]),
-                index.set_names([None, 'Num']),
-                index.set_names(['Upper', 'Num']), ]:
+    for idx in [mi,
+                mi.set_names([None, None]),
+                mi.set_names([None, 'Num']),
+                mi.set_names(['Upper', 'Num']), ]:
         assert idx.has_duplicates
         assert idx.drop_duplicates().names == idx.names
 
 
-def test_duplicates(idx):
+def test_has_duplicates(idx, idx_dup):
+    # see fixtures
+    assert idx.is_unique
     assert not idx.has_duplicates
-    assert idx.append(idx).has_duplicates
+    assert not idx_dup.is_unique
+    assert idx_dup.has_duplicates
 
-    index = MultiIndex(levels=[[0, 1], [0, 1, 2]], labels=[
-        [0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]])
-    assert index.has_duplicates
+    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
+                    labels=[[0, 0, 0, 0, 1, 1, 1],
+                            [0, 1, 2, 0, 0, 1, 2]])
+    assert not mi.is_unique
+    assert mi.has_duplicates
 
+
+def test_has_duplicates_from_tuples():
     # GH 9075
     t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
          (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
@@ -150,9 +165,11 @@ def test_duplicates(idx):
          (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
          (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]
 
-    index = pd.MultiIndex.from_tuples(t)
-    assert not index.has_duplicates
+    mi = MultiIndex.from_tuples(t)
+    assert not mi.has_duplicates
+
 
+def test_has_duplicates_overflow():
     # handle int64 overflow if possible
     def check(nlevels, with_nulls):
         labels = np.tile(np.arange(500), 2)
@@ -171,20 +188,20 @@ def check(nlevels, with_nulls):
         levels = [level] * nlevels + [[0, 1]]
 
         # no dups
-        index = MultiIndex(levels=levels, labels=labels)
-        assert not index.has_duplicates
+        mi = MultiIndex(levels=levels, labels=labels)
+        assert not mi.has_duplicates
 
         # with a dup
         if with_nulls:
             def f(a):
                 return np.insert(a, 1000, a[0])
             labels = list(map(f, labels))
-            index = MultiIndex(levels=levels, labels=labels)
+            mi = MultiIndex(levels=levels, labels=labels)
         else:
-            values = index.values.tolist()
-            index = MultiIndex.from_tuples(values + [values[0]])
+            values = mi.values.tolist()
+            mi = MultiIndex.from_tuples(values + [values[0]])
 
-        assert index.has_duplicates
+        assert mi.has_duplicates
 
     # no overflow
     check(4, False)
@@ -194,29 +211,42 @@ def f(a):
     check(8, False)
    check(8, True)
 
+
+@pytest.mark.parametrize('keep, expected', [
+    ('first', np.array([False, False, False, True, True, False])),
+    ('last', np.array([False, True, True, False, False, False])),
+    (False, np.array([False, True, True, True, True, False]))
+])
+def test_duplicated(idx_dup, keep, expected):
+    result = idx_dup.duplicated(keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last', False])
+def test_duplicated_large(keep):
     # GH 9125
     n, k = 200, 5000
     levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
     labels = [np.random.choice(n, k * n) for lev in levels]
     mi = MultiIndex(levels=levels, labels=labels)
 
-    for keep in ['first', 'last', False]:
-        left = mi.duplicated(keep=keep)
-        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
-        tm.assert_numpy_array_equal(left, right)
+    result = mi.duplicated(keep=keep)
+    expected = hashtable.duplicated_object(mi.values, keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
 
+
+def test_get_duplicates():
     # GH5873
     for a in [101, 102]:
         mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
         assert not mi.has_duplicates
 
         with warnings.catch_warnings(record=True):
             # Deprecated - see GH20239
-            assert mi.get_duplicates().equals(MultiIndex.from_arrays(
-                [[], []]))
+            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))
 
-        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
-            2, dtype='bool'))
+        tm.assert_numpy_array_equal(mi.duplicated(),
                                     np.zeros(2, dtype='bool'))
 
     for n in range(1, 6):  # 1st level shape
         for m in range(1, 5):  # 2nd level shape
@@ -232,28 +262,5 @@ def f(a):
                 assert mi.get_duplicates().equals(MultiIndex.from_arrays(
                     [[], []]))
 
-            tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
-                len(mi), dtype='bool'))
-
-
-def test_get_unique_index(idx):
-    idx = idx[[0, 1, 0, 1, 1, 0, 0]]
-    expected = idx._shallow_copy(idx[[0, 1]])
-
-    for dropna in [False, True]:
-        result = idx._get_unique_index(dropna=dropna)
-        assert result.unique
-        tm.assert_index_equal(result, expected)
-
-
-def test_unique_na():
-    idx = pd.Index([2, np.nan, 2, 1], name='my_index')
-    expected = pd.Index([2, np.nan, 1], name='my_index')
-    result = idx.unique()
-    tm.assert_index_equal(result, expected)
-
-
-def test_duplicate_level_names_access_raises(idx):
-    idx.names = ['foo', 'foo']
-    tm.assert_raises_regex(ValueError, 'name foo occurs multiple times',
-                           idx._get_level_number, 'foo')
+            tm.assert_numpy_array_equal(mi.duplicated(),
+                                        np.zeros(len(mi), dtype='bool'))
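The new parametrized test_duplicated encodes the keep semantics of MultiIndex.duplicated on the idx_dup layout. A minimal sketch of those semantics (assumption: the same six entries as the fixture, built via MultiIndex.from_arrays for brevity):

from pandas import MultiIndex

# Same six entries as idx_dup: ('foo', 'two') and ('bar', 'one') repeat.
mi = MultiIndex.from_arrays([['foo', 'foo', 'bar', 'foo', 'bar', 'bar'],
                             ['one', 'two', 'one', 'two', 'one', 'two']])

# keep='first' flags later occurrences, keep='last' flags earlier ones,
# keep=False flags every member of a duplicated group.
assert mi.duplicated(keep='first').tolist() == [False, False, False, True, True, False]
assert mi.duplicated(keep='last').tolist() == [False, True, True, False, False, False]
assert mi.duplicated(keep=False).tolist() == [False, True, True, True, True, False]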

pandas/tests/indexes/multi/test_names.py (+7)
@@ -115,3 +115,10 @@ def test_names(idx, index_names):
     ind_names = list(index.names)
     level_names = [level.name for level in index.levels]
     assert ind_names == level_names
+
+
+def test_duplicate_level_names_access_raises(idx):
+    # GH19029
+    idx.names = ['foo', 'foo']
+    tm.assert_raises_regex(ValueError, 'name foo occurs multiple times',
+                           idx._get_level_number, 'foo')
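test_duplicate_level_names_access_raises now sits with the other naming tests. A short sketch of the behaviour it covers (assumption: the public get_level_values is used here, whereas the test exercises the private _get_level_number directly): duplicate level names are allowed, but looking a level up by that name is ambiguous and raises ValueError.

import pytest
from pandas import MultiIndex

mi = MultiIndex.from_product([[0, 1]] * 2)
mi.names = ['foo', 'foo']        # duplicate level names are permitted

with pytest.raises(ValueError):  # but name-based level lookup is ambiguous
    mi.get_level_values('foo')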
