23
23
from pandas import (
24
24
Categorical ,
25
25
CategoricalIndex ,
26
+ DataFrame ,
26
27
DatetimeIndex ,
27
28
Index ,
28
29
IntervalIndex ,
30
+ MultiIndex ,
31
+ NaT ,
32
+ Period ,
33
+ PeriodIndex ,
29
34
Series ,
35
+ Timedelta ,
30
36
Timestamp ,
37
+ date_range ,
38
+ timedelta_range ,
39
+ to_datetime ,
40
+ to_timedelta ,
31
41
)
32
42
import pandas ._testing as tm
33
43
import pandas .core .algorithms as algos
36
46
37
47
38
48
class TestFactorize :
49
+ @pytest .mark .parametrize ("sort" , [True , False ])
50
+ def test_factorize (self , index_or_series_obj , sort ):
51
+ obj = index_or_series_obj
52
+ result_codes , result_uniques = obj .factorize (sort = sort )
53
+
54
+ constructor = Index
55
+ if isinstance (obj , MultiIndex ):
56
+ constructor = MultiIndex .from_tuples
57
+ expected_uniques = constructor (obj .unique ())
58
+
59
+ if sort :
60
+ expected_uniques = expected_uniques .sort_values ()
61
+
62
+ # construct an integer ndarray so that
63
+ # `expected_uniques.take(expected_codes)` is equal to `obj`
64
+ expected_uniques_list = list (expected_uniques )
65
+ expected_codes = [expected_uniques_list .index (val ) for val in obj ]
66
+ expected_codes = np .asarray (expected_codes , dtype = np .intp )
67
+
68
+ tm .assert_numpy_array_equal (result_codes , expected_codes )
69
+ tm .assert_index_equal (result_uniques , expected_uniques )
70
+
71
+ def test_series_factorize_na_sentinel_none (self ):
72
+ # GH#35667
73
+ values = np .array ([1 , 2 , 1 , np .nan ])
74
+ ser = Series (values )
75
+ codes , uniques = ser .factorize (na_sentinel = None )
76
+
77
+ expected_codes = np .array ([0 , 1 , 0 , 2 ], dtype = np .intp )
78
+ expected_uniques = Index ([1.0 , 2.0 , np .nan ])
79
+
80
+ tm .assert_numpy_array_equal (codes , expected_codes )
81
+ tm .assert_index_equal (uniques , expected_uniques )
82
+
39
83
def test_basic (self ):
40
84
41
85
codes , uniques = algos .factorize (["a" , "b" , "b" , "a" , "a" , "c" , "c" , "c" ])
@@ -111,34 +155,34 @@ def test_datelike(self):
111
155
tm .assert_index_equal (uniques , exp )
112
156
113
157
# period
114
- v1 = pd . Period ("201302" , freq = "M" )
115
- v2 = pd . Period ("201303" , freq = "M" )
158
+ v1 = Period ("201302" , freq = "M" )
159
+ v2 = Period ("201303" , freq = "M" )
116
160
x = Series ([v1 , v1 , v1 , v2 , v2 , v1 ])
117
161
118
162
# periods are not 'sorted' as they are converted back into an index
119
163
codes , uniques = algos .factorize (x )
120
164
exp = np .array ([0 , 0 , 0 , 1 , 1 , 0 ], dtype = np .intp )
121
165
tm .assert_numpy_array_equal (codes , exp )
122
- tm .assert_index_equal (uniques , pd . PeriodIndex ([v1 , v2 ]))
166
+ tm .assert_index_equal (uniques , PeriodIndex ([v1 , v2 ]))
123
167
124
168
codes , uniques = algos .factorize (x , sort = True )
125
169
exp = np .array ([0 , 0 , 0 , 1 , 1 , 0 ], dtype = np .intp )
126
170
tm .assert_numpy_array_equal (codes , exp )
127
- tm .assert_index_equal (uniques , pd . PeriodIndex ([v1 , v2 ]))
171
+ tm .assert_index_equal (uniques , PeriodIndex ([v1 , v2 ]))
128
172
129
173
# GH 5986
130
- v1 = pd . to_timedelta ("1 day 1 min" )
131
- v2 = pd . to_timedelta ("1 day" )
174
+ v1 = to_timedelta ("1 day 1 min" )
175
+ v2 = to_timedelta ("1 day" )
132
176
x = Series ([v1 , v2 , v1 , v1 , v2 , v2 , v1 ])
133
177
codes , uniques = algos .factorize (x )
134
178
exp = np .array ([0 , 1 , 0 , 0 , 1 , 1 , 0 ], dtype = np .intp )
135
179
tm .assert_numpy_array_equal (codes , exp )
136
- tm .assert_index_equal (uniques , pd . to_timedelta ([v1 , v2 ]))
180
+ tm .assert_index_equal (uniques , to_timedelta ([v1 , v2 ]))
137
181
138
182
codes , uniques = algos .factorize (x , sort = True )
139
183
exp = np .array ([1 , 0 , 1 , 1 , 0 , 0 , 1 ], dtype = np .intp )
140
184
tm .assert_numpy_array_equal (codes , exp )
141
- tm .assert_index_equal (uniques , pd . to_timedelta ([v2 , v1 ]))
185
+ tm .assert_index_equal (uniques , to_timedelta ([v2 , v1 ]))
142
186
143
187
def test_factorize_nan (self ):
144
188
# nan should map to na_sentinel, not reverse_indexer[na_sentinel]
@@ -241,7 +285,7 @@ def test_string_factorize(self, writable):
241
285
tm .assert_numpy_array_equal (uniques , expected_uniques )
242
286
243
287
def test_object_factorize (self , writable ):
244
- data = np .array (["a" , "c" , None , np .nan , "a" , "b" , pd . NaT , "c" ], dtype = object )
288
+ data = np .array (["a" , "c" , None , np .nan , "a" , "b" , NaT , "c" ], dtype = object )
245
289
data .setflags (write = writable )
246
290
expected_codes = np .array ([0 , 1 , - 1 , - 1 , 0 , 2 , - 1 , 1 ], dtype = np .intp )
247
291
expected_uniques = np .array (["a" , "c" , "b" ], dtype = object )
@@ -404,7 +448,7 @@ def test_object_refcount_bug(self):
404
448
405
449
def test_on_index_object (self ):
406
450
407
- mindex = pd . MultiIndex .from_arrays (
451
+ mindex = MultiIndex .from_arrays (
408
452
[np .arange (5 ).repeat (5 ), np .tile (np .arange (5 ), 5 )]
409
453
)
410
454
expected = mindex .values
@@ -456,7 +500,7 @@ def test_datetime64_dtype_array_returned(self):
456
500
dtype = "M8[ns]" ,
457
501
)
458
502
459
- dt_index = pd . to_datetime (
503
+ dt_index = to_datetime (
460
504
[
461
505
"2015-01-03T00:00:00.000000000" ,
462
506
"2015-01-01T00:00:00.000000000" ,
@@ -493,7 +537,7 @@ def test_timedelta64_dtype_array_returned(self):
493
537
# GH 9431
494
538
expected = np .array ([31200 , 45678 , 10000 ], dtype = "m8[ns]" )
495
539
496
- td_index = pd . to_timedelta ([31200 , 45678 , 31200 , 10000 , 45678 ])
540
+ td_index = to_timedelta ([31200 , 45678 , 31200 , 10000 , 45678 ])
497
541
result = algos .unique (td_index )
498
542
tm .assert_numpy_array_equal (result , expected )
499
543
assert result .dtype == expected .dtype
@@ -772,7 +816,7 @@ def test_basic(self):
772
816
773
817
def test_i8 (self ):
774
818
775
- arr = pd . date_range ("20130101" , periods = 3 ).values
819
+ arr = date_range ("20130101" , periods = 3 ).values
776
820
result = algos .isin (arr , [arr [0 ]])
777
821
expected = np .array ([True , False , False ])
778
822
tm .assert_numpy_array_equal (result , expected )
@@ -785,7 +829,7 @@ def test_i8(self):
785
829
expected = np .array ([True , True , False ])
786
830
tm .assert_numpy_array_equal (result , expected )
787
831
788
- arr = pd . timedelta_range ("1 day" , periods = 3 ).values
832
+ arr = timedelta_range ("1 day" , periods = 3 ).values
789
833
result = algos .isin (arr , [arr [0 ]])
790
834
expected = np .array ([True , False , False ])
791
835
tm .assert_numpy_array_equal (result , expected )
@@ -799,7 +843,7 @@ def test_i8(self):
799
843
tm .assert_numpy_array_equal (result , expected )
800
844
801
845
def test_large (self ):
802
- s = pd . date_range ("20000101" , periods = 2000000 , freq = "s" ).values
846
+ s = date_range ("20000101" , periods = 2000000 , freq = "s" ).values
803
847
result = algos .isin (s , s [0 :2 ])
804
848
expected = np .zeros (len (s ), dtype = bool )
805
849
expected [0 ] = True
@@ -950,27 +994,27 @@ def test_different_nans_as_float64(self):
950
994
def test_isin_int_df_string_search(self):
    """Comparing df with int`s (1,2) with a string at isin() ("1")
    -> should not match values because int 1 is not equal str 1"""
    # isin() must not coerce across types: the string "1" is not the int 1.
    frame = DataFrame({"values": [1, 2]})
    all_false = DataFrame({"values": [False, False]})
    tm.assert_frame_equal(frame.isin(["1"]), all_false)
957
1001
958
1002
@pytest.mark.xfail(reason="problem related with issue #34125")
def test_isin_nan_df_string_search(self):
    """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
    -> should not match values because np.nan is not equal str NaN"""
    # np.nan must not be matched by the literal string "NaN"; xfailed
    # until GH#34125 is resolved.
    frame = DataFrame({"values": [np.nan, 2]})
    all_false = DataFrame({"values": [False, False]})
    tm.assert_frame_equal(frame.isin(["NaN"]), all_false)
966
1010
967
1011
@pytest.mark.xfail(reason="problem related with issue #34125")
def test_isin_float_df_string_search(self):
    """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
    -> should not match values because float 1.4245 is not equal str 1.4245"""
    # The string "1.4245" must not match the float 1.4245; xfailed
    # until GH#34125 is resolved.
    frame = DataFrame({"values": [1.4245, 2.32441]})
    all_false = DataFrame({"values": [False, False]})
    tm.assert_frame_equal(frame.isin(["1.4245"]), all_false)
975
1019
976
1020
@@ -1016,8 +1060,8 @@ def test_value_counts_dtypes(self):
1016
1060
algos .value_counts (["1" , 1 ], bins = 1 )
1017
1061
1018
1062
def test_value_counts_nat (self ):
1019
- td = Series ([np .timedelta64 (10000 ), pd . NaT ], dtype = "timedelta64[ns]" )
1020
- dt = pd . to_datetime (["NaT" , "2014-01-01" ])
1063
+ td = Series ([np .timedelta64 (10000 ), NaT ], dtype = "timedelta64[ns]" )
1064
+ dt = to_datetime (["NaT" , "2014-01-01" ])
1021
1065
1022
1066
for s in [td , dt ]:
1023
1067
vc = algos .value_counts (s )
@@ -1051,7 +1095,7 @@ def test_value_counts_datetime_outofbounds(self):
1051
1095
tm .assert_series_equal (res , exp )
1052
1096
1053
1097
# GH 12424
1054
- res = pd . to_datetime (Series (["2362-01-01" , np .nan ]), errors = "ignore" )
1098
+ res = to_datetime (Series (["2362-01-01" , np .nan ]), errors = "ignore" )
1055
1099
exp = Series (["2362-01-01" , np .nan ], dtype = object )
1056
1100
tm .assert_series_equal (res , exp )
1057
1101
@@ -1323,9 +1367,9 @@ def test_datetime_likes(self):
1323
1367
cases = [
1324
1368
np .array ([Timestamp (d ) for d in dt ]),
1325
1369
np .array ([Timestamp (d , tz = "US/Eastern" ) for d in dt ]),
1326
- np .array ([pd . Period (d , freq = "D" ) for d in dt ]),
1370
+ np .array ([Period (d , freq = "D" ) for d in dt ]),
1327
1371
np .array ([np .datetime64 (d ) for d in dt ]),
1328
- np .array ([pd . Timedelta (d ) for d in td ]),
1372
+ np .array ([Timedelta (d ) for d in td ]),
1329
1373
]
1330
1374
1331
1375
exp_first = np .array (
@@ -1530,7 +1574,7 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
1530
1574
s .loc [500 ] = np .nan
1531
1575
elif htable == ht .PyObjectHashTable :
1532
1576
# use different NaN types for object column
1533
- s .loc [500 :502 ] = [np .nan , None , pd . NaT ]
1577
+ s .loc [500 :502 ] = [np .nan , None , NaT ]
1534
1578
1535
1579
# create duplicated selection
1536
1580
s_duplicated = s .sample (frac = 3 , replace = True ).reset_index (drop = True )
@@ -1570,7 +1614,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
1570
1614
s .loc [500 ] = np .nan
1571
1615
elif htable == ht .PyObjectHashTable :
1572
1616
# use different NaN types for object column
1573
- s .loc [500 :502 ] = [np .nan , None , pd . NaT ]
1617
+ s .loc [500 :502 ] = [np .nan , None , NaT ]
1574
1618
1575
1619
# create duplicated selection
1576
1620
s_duplicated = s .sample (frac = 3 , replace = True ).reset_index (drop = True )
@@ -2307,7 +2351,7 @@ def test_diff_datetimelike_nat(self, dtype):
2307
2351
tm .assert_numpy_array_equal (result , expected .T )
2308
2352
2309
2353
def test_diff_ea_axis (self ):
2310
- dta = pd . date_range ("2016-01-01" , periods = 3 , tz = "US/Pacific" )._data
2354
+ dta = date_range ("2016-01-01" , periods = 3 , tz = "US/Pacific" )._data
2311
2355
2312
2356
msg = "cannot diff DatetimeArray on axis=1"
2313
2357
with pytest .raises (ValueError , match = msg ):
0 commit comments