2
2
3
3
import warnings
4
4
from itertools import product
5
+ import pytest
5
6
6
7
import numpy as np
7
- import pandas as pd
8
- import pandas .util .testing as tm
9
- import pytest
10
- from pandas import MultiIndex
8
+
11
9
from pandas .compat import range , u
10
+ from pandas import MultiIndex , DatetimeIndex
11
+ from pandas ._libs import hashtable
12
+ import pandas .util .testing as tm
12
13
13
14
14
15
@pytest.mark.parametrize('names', [None, ['first', 'second']])
def test_unique(names):
    # MultiIndex.unique() should drop repeated label tuples, keep
    # first-seen order, and propagate the level names.
    # Each case: (input level arrays, expected unique level arrays).
    cases = [
        ([[1, 2, 1, 2], [1, 1, 1, 2]], [[1, 2, 2], [1, 1, 2]]),
        ([list('aaaa'), list('abab')], [list('aa'), list('ab')]),
        ([list('aaaa'), list('aaaa')], [['a'], ['a']]),
    ]
    for arrays, unique_arrays in cases:
        mi = MultiIndex.from_arrays(arrays, names=names)
        result = mi.unique()
        expected = MultiIndex.from_arrays(unique_arrays, names=mi.names)
        tm.assert_index_equal(result, expected)

    # GH #20568 - empty MI: unique() of an empty index is itself.
    mi = MultiIndex.from_arrays([[], []], names=names)
    result = mi.unique()
    tm.assert_index_equal(mi, result)
40
38
41
39
42
40
def test_unique_datetimelike():
    # unique() on a MultiIndex built from datetime levels (one naive,
    # one tz-aware), with NaT entries present in both levels.
    level_naive = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
                                 '2015-01-01', 'NaT', 'NaT'])
    level_tz = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
                              '2015-01-02', 'NaT', '2015-01-01'],
                             tz='Asia/Tokyo')
    result = MultiIndex.from_arrays([level_naive, level_tz]).unique()

    # Duplicated (naive, tz) pairs collapse; first-seen order is kept.
    expected_naive = DatetimeIndex(['2015-01-01', '2015-01-01',
                                    'NaT', 'NaT'])
    expected_tz = DatetimeIndex(['2015-01-01', '2015-01-02',
                                 'NaT', '2015-01-01'],
                                tz='Asia/Tokyo')
    expected = MultiIndex.from_arrays([expected_naive, expected_tz])
    tm.assert_index_equal(result, expected)
56
54
57
55
@@ -63,41 +61,51 @@ def test_unique_level(idx, level):
63
61
tm .assert_index_equal (result , expected )
64
62
65
63
# With already unique level
66
- mi = pd . MultiIndex .from_arrays ([[1 , 3 , 2 , 4 ], [1 , 3 , 2 , 5 ]],
67
- names = ['first' , 'second' ])
64
+ mi = MultiIndex .from_arrays ([[1 , 3 , 2 , 4 ], [1 , 3 , 2 , 5 ]],
65
+ names = ['first' , 'second' ])
68
66
result = mi .unique (level = level )
69
67
expected = mi .get_level_values (level )
70
68
tm .assert_index_equal (result , expected )
71
69
72
70
# With empty MI
73
- mi = pd . MultiIndex .from_arrays ([[], []], names = ['first' , 'second' ])
71
+ mi = MultiIndex .from_arrays ([[], []], names = ['first' , 'second' ])
74
72
result = mi .unique (level = level )
75
73
expected = mi .get_level_values (level )
76
74
77
75
76
@pytest.mark.parametrize('dropna', [True, False])
def test_get_unique_index(idx, dropna):
    # _get_unique_index should drop repeated entries and keep first-seen
    # order; dropna is irrelevant here (the fixture holds no NaNs), so
    # both parametrized values expect the same result.
    mi = idx[[0, 1, 0, 1, 1, 0, 0]]
    expected = mi._shallow_copy(mi[[0, 1]])

    result = mi._get_unique_index(dropna=dropna)
    # BUG FIX: was `assert result.unique`, which asserts the truthiness
    # of the bound *method* Index.unique — always True, so the check was
    # vacuous. The intended check is the `is_unique` property.
    assert result.is_unique
    tm.assert_index_equal(result, expected)
84
+
85
+
78
86
def test_duplicate_multiindex_labels():
    # GH 17464
    # Make sure that a MultiIndex with duplicate levels throws a ValueError.
    # FIX: drop the useless `mi = ...` binding — the constructor is expected
    # to raise, so the name was never readable afterwards (flake8 F841).
    with pytest.raises(ValueError):
        MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])

    # And that using set_levels with duplicate levels fails
    mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
                                 [1, 2, 1, 2, 3]])
    with pytest.raises(ValueError):
        mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
                      inplace=True)
90
98
91
99
92
100
@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
                                   [1, 'a', 1]])
def test_duplicate_level_names(names):
    # GH18872, GH19029
    # Duplicate level names are allowed when given at construction time...
    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
    assert mi.names == names

    # ...and when applied afterwards via .rename().
    renamed = MultiIndex.from_product([[0, 1]] * 3).rename(names)
    assert renamed.names == names
103
111
@@ -109,27 +117,34 @@ def test_duplicate_level_names(names):
109
117
110
118
def test_duplicate_meta_data():
    # GH 10115
    # drop_duplicates() must preserve the names metadata for every
    # combination of named/unnamed levels.
    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                    labels=[[0, 0, 0, 0, 1, 1, 1],
                            [0, 1, 2, 0, 0, 1, 2]])

    variants = [
        mi,
        mi.set_names([None, None]),
        mi.set_names([None, 'Num']),
        mi.set_names(['Upper', 'Num']),
    ]
    for idx in variants:
        assert idx.has_duplicates
        assert idx.drop_duplicates().names == idx.names
123
131
124
132
125
def test_has_duplicates(idx, idx_dup):
    # see fixtures: `idx` is duplicate-free, `idx_dup` contains repeats.
    # is_unique and has_duplicates must always be complements.
    assert idx.is_unique
    assert not idx.has_duplicates

    assert not idx_dup.is_unique
    assert idx_dup.has_duplicates

    # An explicitly constructed MultiIndex whose label pairs repeat
    # (the (0, 0) tuple appears twice).
    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                    labels=[[0, 0, 0, 0, 1, 1, 1],
                            [0, 1, 2, 0, 0, 1, 2]])
    assert not mi.is_unique
    assert mi.has_duplicates
132
145
146
+
147
+ def test_has_duplicates_from_tuples ():
133
148
# GH 9075
134
149
t = [(u ('x' ), u ('out' ), u ('z' ), 5 , u ('y' ), u ('in' ), u ('z' ), 169 ),
135
150
(u ('x' ), u ('out' ), u ('z' ), 7 , u ('y' ), u ('in' ), u ('z' ), 119 ),
@@ -150,9 +165,11 @@ def test_duplicates(idx):
150
165
(u ('x' ), u ('out' ), u ('z' ), 33 , u ('y' ), u ('in' ), u ('z' ), 123 ),
151
166
(u ('x' ), u ('out' ), u ('z' ), 12 , u ('y' ), u ('in' ), u ('z' ), 144 )]
152
167
153
- index = pd .MultiIndex .from_tuples (t )
154
- assert not index .has_duplicates
168
+ mi = MultiIndex .from_tuples (t )
169
+ assert not mi .has_duplicates
170
+
155
171
172
+ def test_has_duplicates_overflow ():
156
173
# handle int64 overflow if possible
157
174
def check (nlevels , with_nulls ):
158
175
labels = np .tile (np .arange (500 ), 2 )
@@ -171,20 +188,20 @@ def check(nlevels, with_nulls):
171
188
levels = [level ] * nlevels + [[0 , 1 ]]
172
189
173
190
# no dups
174
- index = MultiIndex (levels = levels , labels = labels )
175
- assert not index .has_duplicates
191
+ mi = MultiIndex (levels = levels , labels = labels )
192
+ assert not mi .has_duplicates
176
193
177
194
# with a dup
178
195
if with_nulls :
179
196
def f (a ):
180
197
return np .insert (a , 1000 , a [0 ])
181
198
labels = list (map (f , labels ))
182
- index = MultiIndex (levels = levels , labels = labels )
199
+ mi = MultiIndex (levels = levels , labels = labels )
183
200
else :
184
- values = index .values .tolist ()
185
- index = MultiIndex .from_tuples (values + [values [0 ]])
201
+ values = mi .values .tolist ()
202
+ mi = MultiIndex .from_tuples (values + [values [0 ]])
186
203
187
- assert index .has_duplicates
204
+ assert mi .has_duplicates
188
205
189
206
# no overflow
190
207
check (4 , False )
@@ -194,29 +211,42 @@ def f(a):
194
211
check (8 , False )
195
212
check (8 , True )
196
213
214
+
215
@pytest.mark.parametrize('keep, expected', [
    ('first', np.array([False, False, False, True, True, False])),
    ('last', np.array([False, True, True, False, False, False])),
    (False, np.array([False, True, True, True, True, False]))
])
def test_duplicated(idx_dup, keep, expected):
    # duplicated() must honour all three `keep` policies ('first',
    # 'last', False) on a MultiIndex known to contain repeats.
    duplicated_mask = idx_dup.duplicated(keep=keep)
    tm.assert_numpy_array_equal(duplicated_mask, expected)
223
+
224
+
225
@pytest.mark.parametrize('keep', ['first', 'last', False])
def test_duplicated_large(keep):
    # GH 9125
    # Cross-check MultiIndex.duplicated against the low-level hashtable
    # implementation on a large, randomly generated index.
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for _ in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    result = mi.duplicated(keep=keep)
    expected = hashtable.duplicated_object(mi.values, keep=keep)
    tm.assert_numpy_array_equal(result, expected)
207
236
237
+
238
+ def test_get_duplicates ():
208
239
# GH5873
209
240
for a in [101 , 102 ]:
210
241
mi = MultiIndex .from_arrays ([[101 , a ], [3.5 , np .nan ]])
211
242
assert not mi .has_duplicates
212
243
213
244
with warnings .catch_warnings (record = True ):
214
245
# Deprecated - see GH20239
215
- assert mi .get_duplicates ().equals (MultiIndex .from_arrays (
216
- [[], []]))
246
+ assert mi .get_duplicates ().equals (MultiIndex .from_arrays ([[], []]))
217
247
218
- tm .assert_numpy_array_equal (mi .duplicated (), np . zeros (
219
- 2 , dtype = 'bool' ))
248
+ tm .assert_numpy_array_equal (mi .duplicated (),
249
+ np . zeros ( 2 , dtype = 'bool' ))
220
250
221
251
for n in range (1 , 6 ): # 1st level shape
222
252
for m in range (1 , 5 ): # 2nd level shape
@@ -232,28 +262,5 @@ def f(a):
232
262
assert mi .get_duplicates ().equals (MultiIndex .from_arrays (
233
263
[[], []]))
234
264
235
- tm .assert_numpy_array_equal (mi .duplicated (), np .zeros (
236
- len (mi ), dtype = 'bool' ))
237
-
238
-
239
- def test_get_unique_index (idx ):
240
- idx = idx [[0 , 1 , 0 , 1 , 1 , 0 , 0 ]]
241
- expected = idx ._shallow_copy (idx [[0 , 1 ]])
242
-
243
- for dropna in [False , True ]:
244
- result = idx ._get_unique_index (dropna = dropna )
245
- assert result .unique
246
- tm .assert_index_equal (result , expected )
247
-
248
-
249
- def test_unique_na ():
250
- idx = pd .Index ([2 , np .nan , 2 , 1 ], name = 'my_index' )
251
- expected = pd .Index ([2 , np .nan , 1 ], name = 'my_index' )
252
- result = idx .unique ()
253
- tm .assert_index_equal (result , expected )
254
-
255
-
256
- def test_duplicate_level_names_access_raises (idx ):
257
- idx .names = ['foo' , 'foo' ]
258
- tm .assert_raises_regex (ValueError , 'name foo occurs multiple times' ,
259
- idx ._get_level_number , 'foo' )
265
+ tm .assert_numpy_array_equal (mi .duplicated (),
266
+ np .zeros (len (mi ), dtype = 'bool' ))
0 commit comments