25
25
from pandas .util .testing import assert_almost_equal
26
26
27
27
28
def assert_series_or_index_or_array_or_categorical_equal(left, right):
    """
    Assert that ``left`` equals ``right``, dispatching on the type of ``left``.

    The comparison is delegated to the ``tm`` assertion helper that matches
    the type of ``left``; the types are checked in the order Series, Index,
    np.ndarray, Categorical.

    Parameters
    ----------
    left : Series, Index, np.ndarray or Categorical
        Object whose type selects the comparison helper.
    right : object
        Object compared against ``left`` by the selected helper.

    Raises
    ------
    AssertionError
        If ``left`` is not an instance of one of the supported types.
        Errors raised by the selected ``tm`` helper propagate unchanged.
    """
    if isinstance(left, Series):
        tm.assert_series_equal(left, right)
    elif isinstance(left, Index):
        tm.assert_index_equal(left, right)
    elif isinstance(left, np.ndarray):
        tm.assert_numpy_array_equal(left, right)
    elif isinstance(left, Categorical):
        tm.assert_categorical_equal(left, right)
    else:
        # Fail explicitly: a bare ``assert`` would be stripped under
        # ``python -O`` and would not say which type was passed in.
        raise AssertionError(
            'Expected Series, Index, np.ndarray or Categorical, got '
            '{name}'.format(name=type(left).__name__))
40
+
41
+
28
42
class TestMatch (object ):
29
43
30
44
def test_ints (self ):
@@ -321,17 +335,22 @@ def test_parametrized_factorize_na_value(self, data, na_value):
321
335
322
336
class TestUnique (object ):
323
337
324
- def test_ints (self ):
325
- arr = np .random .randint (0 , 100 , size = 50 )
338
def test_unique_inverse(self, any_numpy_dtype):
    """
    Check ``algos.unique`` with and without ``return_inverse=True``.

    The input is 50 random integers drawn from [0, 100) and cast to the
    dtype supplied by the ``any_numpy_dtype`` fixture.
    """
    values = np.random.randint(0, 100, size=50).astype(any_numpy_dtype)

    uniques_only = algos.unique(values)
    assert isinstance(uniques_only, np.ndarray)

    # the plain call is the reference for the return_inverse=True call
    reference = uniques_only.copy()

    uniques, inverse = algos.unique(values, return_inverse=True)
    tm.assert_numpy_array_equal(uniques, reference)

    # indexing the uniques with the inverse only gives back the input
    # if the inverse is correct
    tm.assert_numpy_array_equal(uniques[inverse], values, check_dtype=False)
335
354
336
355
def test_object_refcount_bug (self ):
337
356
lst = ['A' , 'B' , 'C' , 'D' , 'E' ]
@@ -376,24 +395,26 @@ def test_datetime64_dtype_array_returned(self):
376
395
tm .assert_numpy_array_equal (result , expected )
377
396
assert result .dtype == expected .dtype
378
397
379
- def test_timedelta64_dtype_array_returned (self ):
398
@pytest.mark.parametrize('box', [Index, Series, np.array])
def test_timedelta64_dtype_array_returned(self, box):
    """
    Check ``algos.unique`` on timedelta data wrapped by ``box`` (GH 9431).

    Covers the plain call and the ``return_inverse=True`` call for Index,
    Series and np.array containers.
    """
    # GH 9431
    wrapped = box(pd.to_timedelta([31200, 45678, 31200, 10000, 45678]))
    expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

    uniques_only = algos.unique(wrapped)
    tm.assert_numpy_array_equal(uniques_only, expected)

    # the plain call is the reference for the return_inverse=True call
    reference = uniques_only.copy()

    uniques, inverse = algos.unique(wrapped, return_inverse=True)
    tm.assert_numpy_array_equal(uniques, reference)

    # re-boxing the uniques indexed by the inverse only gives back the
    # input if the inverse is correct
    rebuilt = box(uniques[inverse])
    assert_series_or_index_or_array_or_categorical_equal(rebuilt, wrapped)
397
418
398
419
def test_uint64_overflow (self ):
399
420
s = Series ([1 , 2 , 2 ** 63 , 2 ** 63 ], dtype = np .uint64 )
@@ -406,78 +427,80 @@ def test_nan_in_object_array(self):
406
427
expected = np .array (['a' , np .nan , 'c' ], dtype = object )
407
428
tm .assert_numpy_array_equal (result , expected )
408
429
409
- def test_categorical (self ):
430
+ result_uniques , result_inverse = pd .unique (duplicated_items ,
431
+ return_inverse = True )
432
+ expected_inverse = np .array ([0 , 1 , 2 , 2 ], dtype = 'int64' )
433
+ tm .assert_numpy_array_equal (result_inverse , expected_inverse )
434
+
435
@pytest.mark.parametrize('ordered', [True, False])
@pytest.mark.parametrize('box', [lambda x: x, Series, Index],
                         ids=['Categorical', 'Series', 'Index'])
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
                                    pd.unique],
                         ids=['classmethod', 'toplevel'])
def test_categorical(self, method, box, ordered):
    """
    Check ``unique`` on categorical data (GH 15939).

    Parameters
    ----------
    method : callable
        Either a wrapper calling ``x.unique(**kwargs)`` or ``pd.unique``.
    box : callable
        Identity (plain Categorical), ``Series`` or ``Index``; applied to
        the Categorical under test.
    ordered : bool
        Whether the Categorical is ordered. Ordered data uses categories
        ``'abc'``; unordered data uses ``'bac'``.
    """
    categories = list('abc') if ordered else list('bac')
    expected = Categorical(list('bac'), categories=categories,
                           ordered=ordered)

    # Index.unique always returns Index
    # pd.unique(Index) stays Index (only) for Categorical
    # (identity check: ``box`` is either the Index class itself or not)
    expected = box(expected) if box is Index else expected

    # GH 15939
    c = box(Categorical(list('baabc'), categories=categories,
                        ordered=ordered))
    result = method(c)

    assert_series_or_index_or_array_or_categorical_equal(result, expected)

    if method is pd.unique:
        # [Series/Index].unique do not yet support return_inverse=True

        # reuse result as expected outcome of return_inverse case
        expected_uniques = result.copy()
        result_uniques, result_inverse = method(c, return_inverse=True)

        assert_series_or_index_or_array_or_categorical_equal(
            result_uniques, expected_uniques)

        # reconstruction can only work if inverse is correct
        reconstr = box(result_uniques[result_inverse])
        assert_series_or_index_or_array_or_categorical_equal(reconstr, c)
442
471
443
- # CI -> return CI
444
- ci = CategoricalIndex ( Categorical ( list ( 'baabc' ),
445
- categories = list ( 'bac' )))
446
- expected = CategoricalIndex ( expected )
447
- result = ci . unique ()
448
- tm . assert_index_equal ( result , expected )
472
@pytest.mark.parametrize('box', [Series, Index])
@pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs),
                                    pd.unique],
                         ids=['classmethod', 'toplevel'])
def test_datetime64tz_aware(self, method, box):
    """
    Check ``unique`` on tz-aware timestamps (GH 15939).

    Parameters
    ----------
    method : callable
        Either a wrapper calling ``x.unique(**kwargs)`` or ``pd.unique``.
    box : type
        ``Series`` or ``Index``; wraps two identical US/Eastern timestamps.
    """
    # GH 15939

    ts = Timestamp('20160101', tz='US/Eastern')
    obj = box([ts, ts])

    # identity check: ``box`` is one of the two parametrized classes
    if box is Series:
        # for a Series the expected uniques are an object ndarray of
        # Timestamps
        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
                                       tz='US/Eastern')], dtype=object)
    else:  # Index
        expected = Index([ts])

    result = method(obj)
    assert_series_or_index_or_array_or_categorical_equal(result, expected)

    if method is pd.unique:
        # [Series/Index].unique do not yet support return_inverse=True

        # reuse result as expected outcome of return_inverse case
        expected_uniques = result.copy()
        result_uniques, result_inverse = method(obj, return_inverse=True)

        assert_series_or_index_or_array_or_categorical_equal(
            result_uniques, expected_uniques)

        # reconstruction can only work if inverse is correct
        reconstr = box(result_uniques[result_inverse])
        assert_series_or_index_or_array_or_categorical_equal(reconstr, obj)
481
504
482
505
def test_order_of_appearance (self ):
483
506
# 9346
@@ -491,28 +514,10 @@ def test_order_of_appearance(self):
491
514
tm .assert_numpy_array_equal (result ,
492
515
np .array ([2 , 1 ], dtype = 'int64' ))
493
516
494
- result = pd .unique (Series ([Timestamp ('20160101' ),
495
- Timestamp ('20160101' )]))
496
- expected = np .array (['2016-01-01T00:00:00.000000000' ],
497
- dtype = 'datetime64[ns]' )
498
- tm .assert_numpy_array_equal (result , expected )
499
-
500
- result = pd .unique (Index (
501
- [Timestamp ('20160101' , tz = 'US/Eastern' ),
502
- Timestamp ('20160101' , tz = 'US/Eastern' )]))
503
- expected = DatetimeIndex (['2016-01-01 00:00:00' ],
504
- dtype = 'datetime64[ns, US/Eastern]' ,
505
- freq = None )
506
- tm .assert_index_equal (result , expected )
507
-
508
517
result = pd .unique (list ('aabc' ))
509
518
expected = np .array (['a' , 'b' , 'c' ], dtype = object )
510
519
tm .assert_numpy_array_equal (result , expected )
511
520
512
- result = pd .unique (Series (Categorical (list ('aabc' ))))
513
- expected = Categorical (list ('abc' ))
514
- tm .assert_categorical_equal (result , expected )
515
-
516
521
@pytest .mark .parametrize ("arg ,expected" , [
517
522
(('1' , '1' , '2' ), np .array (['1' , '2' ], dtype = object )),
518
523
(('foo' ,), np .array (['foo' ], dtype = object ))
0 commit comments