BUG: pd.factorize upconverted unique values (from e.g. int8 -> int64)

topper-123 · topper-123 · commit 6cd938044b38 · 2021-04-24T10:31:19.000+01:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -705,6 +705,7 @@ Conversion
 - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
 - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
 - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
+- Bug in :func:`factorize` where, when given an array with a numeric NumPy dtype smaller than int64, uint64 or float64 (e.g. int8), the unique values were upcast instead of keeping their original dtype (:issue:`xxxxx`)
 - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
 -
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -143,11 +143,14 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
             # until our algos support uint8 directly (see TODO)
             return np.asarray(values).astype("uint64"), np.dtype("bool")
         elif is_signed_integer_dtype(values):
-            return ensure_int64(values), np.dtype("int64")
+            dtype = getattr(values, "dtype", np.dtype("int64"))
+            return ensure_int64(values), dtype
         elif is_unsigned_integer_dtype(values):
-            return ensure_uint64(values), np.dtype("uint64")
+            dtype = getattr(values, "dtype", np.dtype("uint64"))
+            return ensure_uint64(values), dtype
         elif is_float_dtype(values):
-            return ensure_float64(values), np.dtype("float64")
+            dtype = getattr(values, "dtype", np.dtype("float64"))
+            return ensure_float64(values), dtype
         elif is_complex_dtype(values):
 
             # ignore the fact that we are casting to float
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -98,14 +98,14 @@ def test_basic(self):
         codes, uniques = algos.factorize(list(reversed(range(5))))
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
+        exp = np.array([4, 3, 2, 1, 0], dtype=np.int32)
         tm.assert_numpy_array_equal(uniques, exp)
 
         codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
 
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+        exp = np.array([0, 1, 2, 3, 4], dtype=np.int32)
         tm.assert_numpy_array_equal(uniques, exp)
 
         codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
@@ -246,6 +246,16 @@ def test_complex_sorting(self):
         with pytest.raises(TypeError, match=msg):
             algos.factorize(x17[::-1], sort=True)
 
+    def test_numeric_dtype_factorize(self, any_real_dtype):
+        dtype = any_real_dtype
+        data = np.array([1, 2, 2, 1], dtype=dtype)
+        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([1, 2], dtype=dtype)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
     def test_float64_factorize(self, writable):
         data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
         data.setflags(write=writable)

Original file line number	Diff line number	Diff line change
`@@ -705,6 +705,7 @@ Conversion`
`705`	`705`	- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
`706`	`706`	- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
`707`	`707`	- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
	`708`	+- Bug in :func:`factorize` where, when given an array with a numeric NumPy dtype smaller than int64, uint64 or float64 (e.g. int8), the unique values were upcast instead of keeping their original dtype (:issue:`xxxxx`)
`708`	`709`	- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
`709`	`710`	`-`
`710`	`711`