Skip to content

Commit 6cd9380

Browse files
committed
BUG: pd.factorize upcast unique values (e.g. from int8 to int64)
1 parent fd67546 commit 6cd9380

File tree

3 files changed

+19
-5
lines changed

3 files changed

+19
-5
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ Conversion
705705
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
706706
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
707707
- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
708+
- Bug in :func:`factorize` where, when given an array with a numeric NumPy dtype smaller than ``int64``, ``uint64`` or ``float64``, the unique values were upcast instead of keeping their original dtype (:issue:`xxxxx`)
708709
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
709710
-
710711

pandas/core/algorithms.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,14 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
143143
# until our algos support uint8 directly (see TODO)
144144
return np.asarray(values).astype("uint64"), np.dtype("bool")
145145
elif is_signed_integer_dtype(values):
146-
return ensure_int64(values), np.dtype("int64")
146+
dtype = getattr(values, "dtype", np.dtype("int64"))
147+
return ensure_int64(values), dtype
147148
elif is_unsigned_integer_dtype(values):
148-
return ensure_uint64(values), np.dtype("uint64")
149+
dtype = getattr(values, "dtype", np.dtype("uint64"))
150+
return ensure_uint64(values), dtype
149151
elif is_float_dtype(values):
150-
return ensure_float64(values), np.dtype("float64")
152+
dtype = getattr(values, "dtype", np.dtype("float64"))
153+
return ensure_float64(values), dtype
151154
elif is_complex_dtype(values):
152155

153156
# ignore the fact that we are casting to float

pandas/tests/test_algos.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,14 @@ def test_basic(self):
9898
codes, uniques = algos.factorize(list(reversed(range(5))))
9999
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
100100
tm.assert_numpy_array_equal(codes, exp)
101-
exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
101+
exp = np.array([4, 3, 2, 1, 0], dtype=np.int32)
102102
tm.assert_numpy_array_equal(uniques, exp)
103103

104104
codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
105105

106106
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
107107
tm.assert_numpy_array_equal(codes, exp)
108-
exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
108+
exp = np.array([0, 1, 2, 3, 4], dtype=np.int32)
109109
tm.assert_numpy_array_equal(uniques, exp)
110110

111111
codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
@@ -246,6 +246,16 @@ def test_complex_sorting(self):
246246
with pytest.raises(TypeError, match=msg):
247247
algos.factorize(x17[::-1], sort=True)
248248

249+
def test_numeric_dtype_factorize(self, any_real_dtype):
250+
dtype = any_real_dtype
251+
data = np.array([1, 2, 2, 1], dtype=dtype)
252+
expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
253+
expected_uniques = np.array([1, 2], dtype=dtype)
254+
255+
codes, uniques = algos.factorize(data)
256+
tm.assert_numpy_array_equal(codes, expected_codes)
257+
tm.assert_numpy_array_equal(uniques, expected_uniques)
258+
249259
def test_float64_factorize(self, writable):
250260
data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
251261
data.setflags(write=writable)

0 commit comments

Comments
 (0)