@@ -3,7 +3,10 @@
 import numpy as np
 import pytest
 
-from pandas._libs import hashtable
+from pandas._libs import (
+    hashtable,
+    index as libindex,
+)
 
 from pandas import (
     NA,
@@ -232,41 +235,43 @@ def test_duplicated(idx_dup, keep, expected):
 
 
 @pytest.mark.arm_slow
-def test_duplicated_large(keep):
+def test_duplicated_hashtable_impl(keep, monkeypatch):
     # GH 9125
-    n, k = 200, 5000
+    n, k = 6, 10
     levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
-    codes = [np.random.choice(n, k * n) for lev in levels]
-    mi = MultiIndex(levels=levels, codes=codes)
+    codes = [np.random.choice(n, k * n) for _ in levels]
+    with monkeypatch.context() as m:
+        m.setattr(libindex, "_SIZE_CUTOFF", 50)
+        mi = MultiIndex(levels=levels, codes=codes)
 
-    result = mi.duplicated(keep=keep)
-    expected = hashtable.duplicated(mi.values, keep=keep)
+        result = mi.duplicated(keep=keep)
+        expected = hashtable.duplicated(mi.values, keep=keep)
     tm.assert_numpy_array_equal(result, expected)
 
 
-def test_duplicated2():
-    # TODO: more informative test name
+@pytest.mark.parametrize("val", [101, 102])
+def test_duplicated_with_nan(val):
+    # GH5873
+    mi = MultiIndex.from_arrays([[101, val], [3.5, np.nan]])
+    assert not mi.has_duplicates
+
+    tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool"))
+
+
+@pytest.mark.parametrize("n", range(1, 6))
+@pytest.mark.parametrize("m", range(1, 5))
+def test_duplicated_with_nan_multi_shape(n, m):
     # GH5873
-    for a in [101, 102]:
-        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
-        assert not mi.has_duplicates
-
-        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool"))
-
-    for n in range(1, 6):  # 1st level shape
-        for m in range(1, 5):  # 2nd level shape
-            # all possible unique combinations, including nan
-            codes = product(range(-1, n), range(-1, m))
-            mi = MultiIndex(
-                levels=[list("abcde")[:n], list("WXYZ")[:m]],
-                codes=np.random.permutation(list(codes)).T,
-            )
-            assert len(mi) == (n + 1) * (m + 1)
-            assert not mi.has_duplicates
-
-            tm.assert_numpy_array_equal(
-                mi.duplicated(), np.zeros(len(mi), dtype="bool")
-            )
+    # all possible unique combinations, including nan
+    codes = product(range(-1, n), range(-1, m))
+    mi = MultiIndex(
+        levels=[list("abcde")[:n], list("WXYZ")[:m]],
+        codes=np.random.permutation(list(codes)).T,
+    )
+    assert len(mi) == (n + 1) * (m + 1)
+    assert not mi.has_duplicates
+
+    tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype="bool"))
 
 
 def test_duplicated_drop_duplicates():
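A note on the first rewritten test: the old fixture built a 200 * 5000 = 1,000,000-row index purely to cross a size threshold, while the new version lowers `libindex._SIZE_CUTOFF` to 50 so a 60-row index (k * n = 10 * 6) takes the same code path. Judging by the new test name, that cutoff appears to be the point where pandas switches to the hashtable-backed `duplicated` implementation. Below is a minimal sketch of the `monkeypatch.context()` pattern used here; the `fakelib` module and its `SIZE_CUTOFF` are hypothetical stand-ins for `pandas._libs.index`, not pandas APIs:

```python
import types

import pytest

# Hypothetical stand-in for a module with a size threshold like
# pandas._libs.index._SIZE_CUTOFF in the diff above.
fakelib = types.SimpleNamespace(SIZE_CUTOFF=1_000_000)


def choose_impl(n_rows):
    # Pretend dispatch keyed on the module-level cutoff.
    return "hashtable" if n_rows > fakelib.SIZE_CUTOFF else "naive"


def test_small_input_hits_hashtable_path(monkeypatch):
    with monkeypatch.context() as m:
        # Patched only inside this block; pytest restores the
        # original value on exit, even if an assertion fails.
        m.setattr(fakelib, "SIZE_CUTOFF", 50)
        assert choose_impl(60) == "hashtable"
    assert choose_impl(60) == "naive"  # cutoff restored
```

Because `monkeypatch.context()` undoes the patch when the block exits, a failing assertion cannot leak the lowered cutoff into later tests.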
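The second change splits one test in two and converts the nested `for` loops into stacked `@pytest.mark.parametrize` decorators. Stacked decorators generate the cross-product of their argument sets, so each `(n, m)` shape runs and reports as its own test case, just like the old nesting. A minimal sketch:

```python
import pytest


# Stacked parametrize decorators expand to the cross-product of cases:
# this generates 5 * 4 = 20 independent tests, one per (n, m) pair,
# mirroring the nested loops removed in the diff above.
@pytest.mark.parametrize("n", range(1, 6))
@pytest.mark.parametrize("m", range(1, 5))
def test_cross_product(n, m):
    assert (n, m) in [(i, j) for i in range(1, 6) for j in range(1, 5)]
```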