Skip to content

Commit d4c99e3

Browse files
topper-123 authored and yeshsurya committed
BUG: pd.factorize should not upconvert unique values unnecessarily (pandas-dev#41132)
1 parent e79b57c commit d4c99e3

File tree

3 files changed

+27
-12
lines changed

3 files changed

+27
-12
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ Conversion
705705
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
706706
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
707707
- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
708+
- Bug in :func:`factorize` where, when given an array with a numeric NumPy dtype narrower than ``int64``, ``uint64`` or ``float64``, the unique values were upcast instead of keeping their original dtype (:issue:`41132`)
708709
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
709710
-
710711

pandas/core/algorithms.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
143143
# until our algos support uint8 directly (see TODO)
144144
return np.asarray(values).astype("uint64"), np.dtype("bool")
145145
elif is_signed_integer_dtype(values):
146-
return ensure_int64(values), np.dtype("int64")
146+
return ensure_int64(values), values.dtype
147147
elif is_unsigned_integer_dtype(values):
148-
return ensure_uint64(values), np.dtype("uint64")
148+
return ensure_uint64(values), values.dtype
149149
elif is_float_dtype(values):
150-
return ensure_float64(values), np.dtype("float64")
150+
return ensure_float64(values), values.dtype
151151
elif is_complex_dtype(values):
152152

153153
# ignore the fact that we are casting to float

pandas/tests/test_algos.py

+23-9
Original file line numberDiff line numberDiff line change
@@ -95,29 +95,32 @@ def test_basic(self):
9595
exp = np.array(["a", "b", "c"], dtype=object)
9696
tm.assert_numpy_array_equal(uniques, exp)
9797

98-
codes, uniques = algos.factorize(list(reversed(range(5))))
98+
arr = np.arange(5, dtype=np.intp)[::-1]
99+
100+
codes, uniques = algos.factorize(arr)
99101
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
100102
tm.assert_numpy_array_equal(codes, exp)
101-
exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
103+
exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
102104
tm.assert_numpy_array_equal(uniques, exp)
103105

104-
codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
105-
106+
codes, uniques = algos.factorize(arr, sort=True)
106107
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
107108
tm.assert_numpy_array_equal(codes, exp)
108-
exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
109+
exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
109110
tm.assert_numpy_array_equal(uniques, exp)
110111

111-
codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
112+
arr = np.arange(5.0)[::-1]
113+
114+
codes, uniques = algos.factorize(arr)
112115
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
113116
tm.assert_numpy_array_equal(codes, exp)
114-
exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
117+
exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
115118
tm.assert_numpy_array_equal(uniques, exp)
116119

117-
codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
120+
codes, uniques = algos.factorize(arr, sort=True)
118121
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
119122
tm.assert_numpy_array_equal(codes, exp)
120-
exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
123+
exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
121124
tm.assert_numpy_array_equal(uniques, exp)
122125

123126
def test_mixed(self):
@@ -246,6 +249,17 @@ def test_complex_sorting(self):
246249
with pytest.raises(TypeError, match=msg):
247250
algos.factorize(x17[::-1], sort=True)
248251

252+
def test_numeric_dtype_factorize(self, any_real_dtype):
253+
# GH41132
254+
dtype = any_real_dtype
255+
data = np.array([1, 2, 2, 1], dtype=dtype)
256+
expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
257+
expected_uniques = np.array([1, 2], dtype=dtype)
258+
259+
codes, uniques = algos.factorize(data)
260+
tm.assert_numpy_array_equal(codes, expected_codes)
261+
tm.assert_numpy_array_equal(uniques, expected_uniques)
262+
249263
def test_float64_factorize(self, writable):
250264
data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
251265
data.setflags(write=writable)

0 commit comments

Comments
 (0)