Skip to content

Commit 542d52e

Browse files
committed
REGR: assure .unique of mixed strings does not stringize
closes pandas-dev#16107
1 parent f8b25c2 commit 542d52e

File tree

4 files changed

+89
-40
lines changed

4 files changed

+89
-40
lines changed

pandas/core/algorithms.py

+77-35
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
# dtype access #
4040
# --------------- #
4141

42-
def _ensure_data(values, dtype=None):
42+
def _ensure_data(values, dtype=None, infer=True):
4343
"""
4444
routine to ensure that our data is of the correct
4545
input dtype for lower-level routines
@@ -57,10 +57,15 @@ def _ensure_data(values, dtype=None):
5757
values : array-like
5858
dtype : pandas_dtype, optional
5959
coerce to this dtype
60+
infer : boolean, default True
61+
infer object dtypes
6062
6163
Returns
6264
-------
63-
(ndarray, pandas_dtype, algo dtype as a string)
65+
(ndarray,
66+
pandas_dtype,
67+
algo dtype as a string,
68+
inferred type as a string or None)
6469
6570
"""
6671

@@ -69,28 +74,40 @@ def _ensure_data(values, dtype=None):
6974
if is_bool_dtype(values) or is_bool_dtype(dtype):
7075
# we are actually coercing to uint64
7176
# until our algos support uint8 directly (see TODO)
72-
return np.asarray(values).astype('uint64'), 'bool', 'uint64'
77+
return np.asarray(values).astype('uint64'), 'bool', 'uint64', None
7378
elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
74-
return _ensure_int64(values), 'int64', 'int64'
79+
return _ensure_int64(values), 'int64', 'int64', None
7580
elif (is_unsigned_integer_dtype(values) or
7681
is_unsigned_integer_dtype(dtype)):
77-
return _ensure_uint64(values), 'uint64', 'uint64'
82+
return _ensure_uint64(values), 'uint64', 'uint64', None
7883
elif is_float_dtype(values) or is_float_dtype(dtype):
79-
return _ensure_float64(values), 'float64', 'float64'
84+
return _ensure_float64(values), 'float64', 'float64', None
8085
elif is_object_dtype(values) and dtype is None:
81-
return _ensure_object(np.asarray(values)), 'object', 'object'
86+
87+
# if we can infer a numeric then do this
88+
inferred = None
89+
if infer:
90+
inferred = lib.infer_dtype(values)
91+
if inferred in ['integer']:
92+
return _ensure_int64(values), 'int64', 'int64', inferred
93+
elif inferred in ['floating']:
94+
return _ensure_float64(values), 'float64', 'float64', inferred
96+
97+
return (_ensure_object(np.asarray(values)),
98+
'object', 'object', inferred)
8299
elif is_complex_dtype(values) or is_complex_dtype(dtype):
83100

84101
# ignore the fact that we are casting to float
85102
# which discards complex parts
86103
with catch_warnings(record=True):
87104
values = _ensure_float64(values)
88-
return values, 'float64', 'float64'
105+
return values, 'float64', 'float64', None
89106

90-
except (TypeError, ValueError):
107+
except (TypeError, ValueError, OverflowError):
91108
# if we are trying to coerce to a dtype
92109
# and it is incompat this will fall thru to here
93-
return _ensure_object(values), 'object', 'object'
110+
return _ensure_object(values), 'object', 'object', None
94111

95112
# datetimelike
96113
if (needs_i8_conversion(values) or
@@ -111,7 +128,7 @@ def _ensure_data(values, dtype=None):
111128
values = DatetimeIndex(values)
112129
dtype = values.dtype
113130

114-
return values.asi8, dtype, 'int64'
131+
return values.asi8, dtype, 'int64', None
115132

116133
elif is_categorical_dtype(values) or is_categorical_dtype(dtype):
117134
values = getattr(values, 'values', values)
@@ -122,11 +139,11 @@ def _ensure_data(values, dtype=None):
122139
# until our algos support int* directly (not all do)
123140
values = _ensure_int64(values)
124141

125-
return values, dtype, 'int64'
142+
return values, dtype, 'int64', None
126143

127144
# we have failed, return object
128145
values = np.asarray(values)
129-
return _ensure_object(values), 'object', 'object'
146+
return _ensure_object(values), 'object', 'object', None
130147

131148

132149
def _reconstruct_data(values, dtype, original):
@@ -150,7 +167,13 @@ def _reconstruct_data(values, dtype, original):
150167
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
151168
values = Index(original)._shallow_copy(values, name=None)
152169
elif dtype is not None:
153-
values = values.astype(dtype)
170+
171+
# don't cast to object if we are numeric
172+
if is_object_dtype(dtype):
173+
if not is_numeric_dtype(values):
174+
values = values.astype(dtype)
175+
else:
176+
values = values.astype(dtype)
154177

155178
return values
156179

@@ -161,7 +184,7 @@ def _ensure_arraylike(values):
161184
"""
162185
if not isinstance(values, (np.ndarray, ABCCategorical,
163186
ABCIndexClass, ABCSeries)):
164-
values = np.array(values)
187+
values = np.array(values, dtype=object)
165188
return values
166189

167190

@@ -174,11 +197,13 @@ def _ensure_arraylike(values):
174197
}
175198

176199

177-
def _get_hashtable_algo(values):
200+
def _get_hashtable_algo(values, infer=False):
178201
"""
179202
Parameters
180203
----------
181204
values : arraylike
205+
infer : boolean, default False
206+
infer object dtypes
182207
183208
Returns
184209
-------
@@ -188,12 +213,12 @@ def _get_hashtable_algo(values):
188213
dtype,
189214
ndtype)
190215
"""
191-
values, dtype, ndtype = _ensure_data(values)
216+
values, dtype, ndtype, inferred = _ensure_data(values, infer=infer)
192217

193218
if ndtype == 'object':
194219

195220
# its cheaper to use a String Hash Table than Object
196-
if lib.infer_dtype(values) in ['string']:
221+
if inferred in ['string']:
197222
ndtype = 'string'
198223
else:
199224
ndtype = 'object'
@@ -202,24 +227,41 @@ def _get_hashtable_algo(values):
202227
return (htable, table, values, dtype, ndtype)
203228

204229

205-
def _get_data_algo(values, func_map):
230+
def _get_data_algo(values, func_map, dtype=None, infer=False):
231+
"""
232+
Parameters
233+
----------
234+
values : array-like
235+
func_map : an inferred -> function dict
236+
dtype : dtype, optional
237+
the requested dtype
238+
infer : boolean, default False
239+
infer object dtypes
240+
241+
Returns
242+
-------
243+
(function,
244+
values,
245+
ndtype)
246+
"""
206247

207248
if is_categorical_dtype(values):
208249
values = values._values_for_rank()
209250

210-
values, dtype, ndtype = _ensure_data(values)
251+
values, dtype, ndtype, inferred = _ensure_data(
252+
values, dtype=dtype, infer=infer)
211253
if ndtype == 'object':
212254

213255
# its cheaper to use a String Hash Table than Object
214-
if lib.infer_dtype(values) in ['string']:
256+
if inferred in ['string']:
215257
try:
216258
f = func_map['string']
217259
except KeyError:
218260
pass
219261

220262
f = func_map.get(ndtype, func_map['object'])
221263

222-
return f, values
264+
return f, values, ndtype
223265

224266

225267
# --------------- #
@@ -248,7 +290,7 @@ def match(to_match, values, na_sentinel=-1):
248290
"""
249291
values = com._asarray_tuplesafe(values)
250292
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
251-
to_match, _, _ = _ensure_data(to_match, dtype)
293+
to_match, _, _, _ = _ensure_data(to_match, dtype)
252294
table = htable(min(len(to_match), 1000000))
253295
table.map_locations(values)
254296
result = table.lookup(to_match)
@@ -344,7 +386,7 @@ def unique(values):
344386
return values.unique()
345387

346388
original = values
347-
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
389+
htable, _, values, dtype, ndtype = _get_hashtable_algo(values, infer=False)
348390

349391
table = htable(len(values))
350392
uniques = table.unique(values)
@@ -389,8 +431,8 @@ def isin(comps, values):
389431
if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
390432
values = np.array(list(values), dtype='object')
391433

392-
comps, dtype, _ = _ensure_data(comps)
393-
values, _, _ = _ensure_data(values, dtype=dtype)
434+
comps, dtype, _, _ = _ensure_data(comps)
435+
values, _, _, _ = _ensure_data(values, dtype=dtype)
394436

395437
# GH11232
396438
# work-around for numpy < 1.8 and comparisons on py3
@@ -499,7 +541,7 @@ def sort_mixed(values):
499541

500542
if sorter is None:
501543
# mixed types
502-
(hash_klass, _), values = _get_data_algo(values, _hashtables)
544+
(hash_klass, _), values, _ = _get_data_algo(values, _hashtables)
503545
t = hash_klass(len(values))
504546
t.map_locations(values)
505547
sorter = _ensure_platform_int(t.lookup(ordered))
@@ -545,8 +587,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
545587

546588
values = _ensure_arraylike(values)
547589
original = values
548-
values, dtype, _ = _ensure_data(values)
549-
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
590+
values, dtype, _, _ = _ensure_data(values)
591+
(hash_klass, vec_klass), values, _ = _get_data_algo(values, _hashtables)
550592

551593
table = hash_klass(size_hint or len(values))
552594
uniques = vec_klass()
@@ -660,7 +702,7 @@ def _value_counts_arraylike(values, dropna):
660702
"""
661703
values = _ensure_arraylike(values)
662704
original = values
663-
values, dtype, ndtype = _ensure_data(values)
705+
values, dtype, ndtype, inferred = _ensure_data(values)
664706

665707
if needs_i8_conversion(dtype):
666708
# i8
@@ -711,7 +753,7 @@ def duplicated(values, keep='first'):
711753
duplicated : ndarray
712754
"""
713755

714-
values, dtype, ndtype = _ensure_data(values)
756+
values, dtype, ndtype, inferred = _ensure_data(values)
715757
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
716758
return f(values, keep=keep)
717759

@@ -741,7 +783,7 @@ def mode(values):
741783
return Series(values.values.mode(), name=values.name)
742784
return values.mode()
743785

744-
values, dtype, ndtype = _ensure_data(values)
786+
values, dtype, ndtype, inferred = _ensure_data(values)
745787

746788
# TODO: this should support float64
747789
if ndtype not in ['int64', 'uint64', 'object']:
@@ -785,11 +827,11 @@ def rank(values, axis=0, method='average', na_option='keep',
785827
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
786828
"""
787829
if values.ndim == 1:
788-
f, values = _get_data_algo(values, _rank1d_functions)
830+
f, values, _ = _get_data_algo(values, _rank1d_functions)
789831
ranks = f(values, ties_method=method, ascending=ascending,
790832
na_option=na_option, pct=pct)
791833
elif values.ndim == 2:
792-
f, values = _get_data_algo(values, _rank2d_functions)
834+
f, values, _ = _get_data_algo(values, _rank2d_functions)
793835
ranks = f(values, axis=axis, ties_method=method,
794836
ascending=ascending, na_option=na_option, pct=pct)
795837
else:
@@ -1049,7 +1091,7 @@ def compute(self, method):
10491091
return dropped[slc].sort_values(ascending=ascending).head(n)
10501092

10511093
# fast method
1052-
arr, _, _ = _ensure_data(dropped.values)
1094+
arr, _, _, _ = _ensure_data(dropped.values)
10531095
if method == 'nlargest':
10541096
arr = -arr
10551097

pandas/core/categorical.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2088,8 +2088,9 @@ def _get_codes_for_values(values, categories):
20882088
values = _ensure_object(values)
20892089
categories = _ensure_object(categories)
20902090

2091-
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
2092-
(_, _), cats = _get_data_algo(categories, _hashtables)
2091+
(hash_klass, vec_klass), vals, dtype = _get_data_algo(
2092+
values, _hashtables, infer=False)
2093+
(_, _), cats, _ = _get_data_algo(categories, _hashtables, dtype=dtype)
20932094
t = hash_klass(len(cats))
20942095
t.map_locations(cats)
20952096
return coerce_indexer_dtype(t.lookup(vals), cats)

pandas/tests/indexes/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,7 @@ def test_get_unique_index(self):
323323

324324
idx = ind[[0] * 5]
325325
idx_unique = ind[[0]]
326+
326327
# We test against `idx_unique`, so first we make sure it's unique
327328
# and doesn't contain nans.
328329
self.assertTrue(idx_unique.is_unique)
@@ -336,7 +337,6 @@ def test_get_unique_index(self):
336337
tm.assert_index_equal(result, idx_unique)
337338

338339
# nans:
339-
340340
if not ind._can_hold_na:
341341
continue
342342

pandas/tests/test_algos.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,12 @@ def test_uint64_overflow(self):
387387
exp = np.array([1, 2, 2**63], dtype=np.uint64)
388388
tm.assert_numpy_array_equal(algos.unique(s), exp)
389389

390+
def test_nan_in_object_array(self):
391+
l = ['a', np.nan, 'c', 'c']
392+
result = pd.unique(l)
393+
expected = np.array(['a', np.nan, 'c'], dtype=object)
394+
tm.assert_numpy_array_equal(result, expected)
395+
390396
def test_categorical(self):
391397

392398
# we are expecting to return in the order
@@ -1375,11 +1381,11 @@ def test_int64_add_overflow():
13751381
class TestMode(tm.TestCase):
13761382

13771383
def test_no_mode(self):
1378-
exp = Series([], dtype=np.float64)
1384+
exp = Series([], dtype=object)
13791385
tm.assert_series_equal(algos.mode([]), exp)
13801386

1381-
# GH 15714
13821387
def test_mode_single(self):
1388+
# GH 15714
13831389
exp_single = [1]
13841390
data_single = [1]
13851391

0 commit comments

Comments
 (0)