12
12
import pandas .hashtable as htable
13
13
from pandas .compat import string_types
14
14
15
+
15
16
def match (to_match , values , na_sentinel = - 1 ):
16
17
"""
17
18
Compute locations of to_match into values
@@ -44,7 +45,8 @@ def match(to_match, values, na_sentinel=-1):
44
45
# replace but return a numpy array
45
46
# use a Series because it handles dtype conversions properly
46
47
from pandas .core .series import Series
47
- result = Series (result .ravel ()).replace (- 1 ,na_sentinel ).values .reshape (result .shape )
48
+ result = Series (result .ravel ()).replace (- 1 , na_sentinel ).values .\
49
+ reshape (result .shape )
48
50
49
51
return result
50
52
@@ -63,6 +65,7 @@ def unique(values):
63
65
uniques
64
66
"""
65
67
values = com ._asarray_tuplesafe (values )
68
+
66
69
f = lambda htype , caster : _unique_generic (values , htype , caster )
67
70
return _hashtable_algo (f , values .dtype )
68
71
@@ -95,9 +98,9 @@ def isin(comps, values):
95
98
# work-around for numpy < 1.8 and comparisons on py3
96
99
# faster for larger cases to use np.in1d
97
100
if (_np_version_under1p8 and compat .PY3 ) or len (comps ) > 1000000 :
98
- f = lambda x , y : np .in1d (x ,np .asarray (list (y )))
101
+ f = lambda x , y : np .in1d (x , np .asarray (list (y )))
99
102
else :
100
- f = lambda x , y : lib .ismember_int64 (x ,set (y ))
103
+ f = lambda x , y : lib .ismember_int64 (x , set (y ))
101
104
102
105
# may need i8 conversion for proper membership testing
103
106
if com .is_datetime64_dtype (comps ):
@@ -115,6 +118,7 @@ def isin(comps, values):
115
118
116
119
return f (comps , values )
117
120
121
+
118
122
def _hashtable_algo (f , dtype , return_dtype = None ):
119
123
"""
120
124
f(HashTable, type_caster) -> result
@@ -148,8 +152,6 @@ def _unique_generic(values, table_type, type_caster):
148
152
return type_caster (uniques )
149
153
150
154
151
-
152
-
153
155
def factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
154
156
"""
155
157
Encode input values as an enumerated type or categorical variable
@@ -169,12 +171,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
169
171
-------
170
172
labels : the indexer to the original array
171
173
uniques : ndarray (1-d) or Index
172
- the unique values. Index is returned when passed values is Index or Series
174
+ the unique values. Index is returned when passed values is Index or
175
+ Series
173
176
174
- note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex
177
+ note: an array of Periods will ignore sort as it returns an always sorted
178
+ PeriodIndex
175
179
"""
176
180
if order is not None :
177
- msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926"
181
+ msg = "order is deprecated. See " \
182
+ "https://github.com/pydata/pandas/issues/6926"
178
183
warn (msg , FutureWarning , stacklevel = 2 )
179
184
180
185
from pandas .core .index import Index
@@ -203,10 +208,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
203
208
204
209
# order ints before strings
205
210
ordered = np .concatenate ([
206
- np .sort (np .array ([ e for i , e in enumerate (uniques ) if f (e ) ],dtype = object )) for f in [ lambda x : not isinstance (x ,string_types ),
207
- lambda x : isinstance (x ,string_types ) ]
208
- ])
209
- sorter = com ._ensure_platform_int (t .lookup (com ._ensure_object (ordered )))
211
+ np .sort (np .array ([e for i , e in enumerate (uniques ) if f (e )],
212
+ dtype = object )) for f in
213
+ [lambda x : not isinstance (x , string_types ),
214
+ lambda x : isinstance (x , string_types )]])
215
+ sorter = com ._ensure_platform_int (t .lookup (
216
+ com ._ensure_object (ordered )))
210
217
211
218
reverse_indexer = np .empty (len (sorter ), dtype = np .int_ )
212
219
reverse_indexer .put (sorter , np .arange (len (sorter )))
@@ -276,7 +283,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
276
283
is_period = com .is_period_arraylike (values )
277
284
is_datetimetz = com .is_datetimetz (values )
278
285
279
- if com .is_datetime_or_timedelta_dtype (dtype ) or is_period or is_datetimetz :
286
+ if com .is_datetime_or_timedelta_dtype (dtype ) or is_period or \
287
+ is_datetimetz :
280
288
281
289
if is_period :
282
290
values = PeriodIndex (values )
@@ -300,7 +308,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
300
308
else :
301
309
keys = keys .astype (dtype )
302
310
303
-
304
311
elif com .is_integer_dtype (dtype ):
305
312
values = com ._ensure_int64 (values )
306
313
keys , counts = htable .value_count_scalar64 (values , dropna )
@@ -322,7 +329,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
322
329
323
330
if bins is not None :
324
331
# TODO: This next line should be more efficient
325
- result = result .reindex (np .arange (len (cat .categories )), fill_value = 0 )
332
+ result = result .reindex (np .arange (len (cat .categories )),
333
+ fill_value = 0 )
326
334
result .index = bins [:- 1 ]
327
335
328
336
if sort :
@@ -525,12 +533,11 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr):
525
533
526
534
527
535
def nsmallest (arr , n , keep = 'first' ):
528
- '''
536
+ """
529
537
Find the indices of the n smallest values of a numpy array.
530
538
531
539
Note: Fails silently with NaN.
532
-
533
- '''
540
+ """
534
541
if keep == 'last' :
535
542
arr = arr [::- 1 ]
536
543
0 commit comments