2
2
3
3
import numpy as np
4
4
5
+ from pandas ._libs import lib
6
+
5
7
import pandas as pd
6
8
from pandas .util import testing as tm
7
9
8
# Locate the hash_pandas_object implementation: it has lived in different
# modules across pandas versions, so probe the candidates in order and keep
# the first one that imports cleanly.
for _candidate in ["pandas.util", "pandas.tools.hashing"]:
    try:
        hashing = import_module(_candidate)
    except (ImportError, TypeError, ValueError):
        continue
    break
14
16
15
17
18
class MaybeConvertObjects:
    """Benchmark ``lib.maybe_convert_objects`` on an object-dtype array.

    The fixture is a long run of Python ints with a single ``pd.NaT``
    planted at position 0, so the converter must scan mixed contents.
    """

    def setup(self):
        size = 10 ** 5
        values = list(range(size))
        values[0] = pd.NaT
        self.data = np.array(values)

    def time_maybe_convert_objects(self):
        lib.maybe_convert_objects(self.data)
29
+
30
+
16
31
class Factorize :
17
32
18
- params = [[True , False ], [' int' , ' uint' , ' float' , ' string' ]]
19
- param_names = [' sort' , ' dtype' ]
33
+ params = [[True , False ], [" int" , " uint" , " float" , " string" ]]
34
+ param_names = [" sort" , " dtype" ]
20
35
21
36
def setup (self , sort , dtype ):
22
- N = 10 ** 5
23
- data = {'int' : pd .Int64Index (np .arange (N ).repeat (5 )),
24
- 'uint' : pd .UInt64Index (np .arange (N ).repeat (5 )),
25
- 'float' : pd .Float64Index (np .random .randn (N ).repeat (5 )),
26
- 'string' : tm .makeStringIndex (N ).repeat (5 )}
37
+ N = 10 ** 5
38
+ data = {
39
+ "int" : pd .Int64Index (np .arange (N ).repeat (5 )),
40
+ "uint" : pd .UInt64Index (np .arange (N ).repeat (5 )),
41
+ "float" : pd .Float64Index (np .random .randn (N ).repeat (5 )),
42
+ "string" : tm .makeStringIndex (N ).repeat (5 ),
43
+ }
27
44
self .idx = data [dtype ]
28
45
29
46
def time_factorize (self , sort , dtype ):
@@ -32,15 +49,17 @@ def time_factorize(self, sort, dtype):
32
49
33
50
class FactorizeUnique :
34
51
35
- params = [[True , False ], [' int' , ' uint' , ' float' , ' string' ]]
36
- param_names = [' sort' , ' dtype' ]
52
+ params = [[True , False ], [" int" , " uint" , " float" , " string" ]]
53
+ param_names = [" sort" , " dtype" ]
37
54
38
55
def setup (self , sort , dtype ):
39
- N = 10 ** 5
40
- data = {'int' : pd .Int64Index (np .arange (N )),
41
- 'uint' : pd .UInt64Index (np .arange (N )),
42
- 'float' : pd .Float64Index (np .arange (N )),
43
- 'string' : tm .makeStringIndex (N )}
56
+ N = 10 ** 5
57
+ data = {
58
+ "int" : pd .Int64Index (np .arange (N )),
59
+ "uint" : pd .UInt64Index (np .arange (N )),
60
+ "float" : pd .Float64Index (np .arange (N )),
61
+ "string" : tm .makeStringIndex (N ),
62
+ }
44
63
self .idx = data [dtype ]
45
64
assert self .idx .is_unique
46
65
@@ -50,15 +69,17 @@ def time_factorize(self, sort, dtype):
50
69
51
70
class Duplicated :
52
71
53
- params = [[' first' , ' last' , False ], [' int' , ' uint' , ' float' , ' string' ]]
54
- param_names = [' keep' , ' dtype' ]
72
+ params = [[" first" , " last" , False ], [" int" , " uint" , " float" , " string" ]]
73
+ param_names = [" keep" , " dtype" ]
55
74
56
75
def setup (self , keep , dtype ):
57
- N = 10 ** 5
58
- data = {'int' : pd .Int64Index (np .arange (N ).repeat (5 )),
59
- 'uint' : pd .UInt64Index (np .arange (N ).repeat (5 )),
60
- 'float' : pd .Float64Index (np .random .randn (N ).repeat (5 )),
61
- 'string' : tm .makeStringIndex (N ).repeat (5 )}
76
+ N = 10 ** 5
77
+ data = {
78
+ "int" : pd .Int64Index (np .arange (N ).repeat (5 )),
79
+ "uint" : pd .UInt64Index (np .arange (N ).repeat (5 )),
80
+ "float" : pd .Float64Index (np .random .randn (N ).repeat (5 )),
81
+ "string" : tm .makeStringIndex (N ).repeat (5 ),
82
+ }
62
83
self .idx = data [dtype ]
63
84
# cache is_unique
64
85
self .idx .is_unique
@@ -69,15 +90,17 @@ def time_duplicated(self, keep, dtype):
69
90
70
91
class DuplicatedUniqueIndex :
71
92
72
- params = [' int' , ' uint' , ' float' , ' string' ]
73
- param_names = [' dtype' ]
93
+ params = [" int" , " uint" , " float" , " string" ]
94
+ param_names = [" dtype" ]
74
95
75
96
def setup (self , dtype ):
76
- N = 10 ** 5
77
- data = {'int' : pd .Int64Index (np .arange (N )),
78
- 'uint' : pd .UInt64Index (np .arange (N )),
79
- 'float' : pd .Float64Index (np .random .randn (N )),
80
- 'string' : tm .makeStringIndex (N )}
97
+ N = 10 ** 5
98
+ data = {
99
+ "int" : pd .Int64Index (np .arange (N )),
100
+ "uint" : pd .UInt64Index (np .arange (N )),
101
+ "float" : pd .Float64Index (np .random .randn (N )),
102
+ "string" : tm .makeStringIndex (N ),
103
+ }
81
104
self .idx = data [dtype ]
82
105
# cache is_unique
83
106
self .idx .is_unique
@@ -87,58 +110,77 @@ def time_duplicated_unique(self, dtype):
87
110
88
111
89
112
class Hashing:
    """Benchmarks for ``hash_pandas_object`` over a mixed-dtype DataFrame.

    ``setup_cache`` builds the frame once; asv passes its return value to
    every ``time_*`` method as ``df``.
    """

    def setup_cache(self):
        n = 10 ** 5
        strings = pd.Series(
            tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=n))
        )
        df = pd.DataFrame(
            {
                "strings": strings,
                "floats": np.random.randn(n),
                "ints": np.arange(n),
                "dates": pd.date_range("20110101", freq="s", periods=n),
                "timedeltas": pd.timedelta_range("1 day", freq="s", periods=n),
            }
        )
        df["categories"] = df["strings"].astype("category")
        # plant some missing values so NaN handling is part of each benchmark
        df.iloc[10:20] = np.nan
        return df

    def time_frame(self, df):
        hashing.hash_pandas_object(df)

    def time_series_int(self, df):
        hashing.hash_pandas_object(df["ints"])

    def time_series_string(self, df):
        hashing.hash_pandas_object(df["strings"])

    def time_series_float(self, df):
        hashing.hash_pandas_object(df["floats"])

    def time_series_categorical(self, df):
        hashing.hash_pandas_object(df["categories"])

    def time_series_timedeltas(self, df):
        hashing.hash_pandas_object(df["timedeltas"])

    def time_series_dates(self, df):
        hashing.hash_pandas_object(df["dates"])
125
151
126
152
127
153
class Quantile:
    """Benchmark ``Series.quantile`` across quantile points, interpolation
    modes, and numeric dtypes."""

    params = [
        [0, 0.5, 1],
        ["linear", "nearest", "lower", "higher", "midpoint"],
        ["float", "int", "uint"],
    ]
    param_names = ["quantile", "interpolation", "dtype"]

    def setup(self, quantile, interpolation, dtype):
        n = 10 ** 5
        raw = {
            "int": np.arange(n),
            "uint": np.arange(n).astype(np.uint64),
            "float": np.random.randn(n),
        }
        # repeat each value 5x so the series contains duplicates
        self.idx = pd.Series(raw[dtype].repeat(5))

    def time_quantile(self, quantile, interpolation, dtype):
        self.idx.quantile(quantile, interpolation=interpolation)
142
172
143
173
174
class SortIntegerArray:
    """Benchmark ``argsort`` on a nullable Int64 array containing one NA."""

    params = [10 ** 3, 10 ** 5]

    def setup(self, N):
        values = np.arange(N, dtype=float)
        values[40] = np.nan  # becomes pd.NA in the nullable extension array
        self.array = pd.array(values, dtype="Int64")

    def time_argsort(self, N):
        self.array.argsort()
184
+
185
+
144
186
from .pandas_vb_common import setup # noqa: F401 isort:skip
0 commit comments