DEPR: Index inferring numeric dtype from ndarray[object] (pandas-dev#42870)

jbrockmendel · web-flow · commit 126a19d038b6 · 2021-08-08T19:34:29.000-04:00
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -219,7 +219,7 @@ def box_expected(expected, box_cls, transpose=True):
         else:
             expected = pd.array(expected)
     elif box_cls is Index:
-        expected = Index(expected)
+        expected = Index._with_infer(expected)
     elif box_cls is Series:
         expected = Series(expected)
     elif box_cls is DataFrame:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2031,7 +2031,9 @@ def _validate_listlike(self, value):
         from pandas import Index
 
         # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
-        to_add = Index(value, tupleize_cols=False).difference(self.categories)
+        to_add = Index._with_infer(value, tupleize_cols=False).difference(
+            self.categories
+        )
 
         # no assignments of values not in categories, but it's always ok to set
         # something to np.nan
@@ -2741,6 +2743,7 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
         # as values but its codes are by def [0, ..., len(n_categories) - 1]
         cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
         cat = Categorical.from_codes(cat_codes, dtype=values.dtype)
+
         categories = CategoricalIndex(cat)
         codes = values.codes
     else:
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -16,7 +16,10 @@
 
 from pandas._config import get_option
 
-from pandas._libs import NaT
+from pandas._libs import (
+    NaT,
+    lib,
+)
 from pandas._libs.interval import (
     VALID_CLOSED,
     Interval,
@@ -225,6 +228,9 @@ def __new__(
             left, right, infer_closed = intervals_to_interval_bounds(
                 data, validate_closed=closed is None
             )
+            if left.dtype == object:
+                left = lib.maybe_convert_objects(left)
+                right = lib.maybe_convert_objects(right)
             closed = closed or infer_closed
 
         return cls._simple_new(
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -529,7 +529,7 @@ def validate_categories(categories, fastpath: bool = False) -> Index:
                 f"Parameter 'categories' must be list-like, was {repr(categories)}"
             )
         elif not isinstance(categories, ABCIndex):
-            categories = Index(categories, tupleize_cols=False)
+            categories = Index._with_infer(categories, tupleize_cols=False)
 
         if not fastpath:
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -455,7 +455,7 @@ def _get_index() -> Index:
             if self.grouper.nkeys > 1:
                 index = MultiIndex.from_tuples(keys, names=self.grouper.names)
             else:
-                index = Index(keys, name=self.grouper.names[0])
+                index = Index._with_infer(keys, name=self.grouper.names[0])
             return index
 
         if isinstance(values[0], dict):
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -646,7 +646,7 @@ def group_index(self) -> Index:
             return self._group_index
 
         uniques = self._codes_and_uniques[1]
-        return Index(uniques, name=self.name)
+        return Index._with_infer(uniques, name=self.name)
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -471,7 +471,9 @@ def __new__(
                 arr = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
 
                 if dtype is None:
-                    arr = _maybe_cast_data_without_dtype(arr)
+                    arr = _maybe_cast_data_without_dtype(
+                        arr, cast_numeric_deprecated=True
+                    )
                     dtype = arr.dtype
 
                     if kwargs:
@@ -504,6 +506,15 @@ def __new__(
             # other iterable of some kind
 
             subarr = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
+            if dtype is None:
+                # with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated
+                # error: Incompatible types in assignment (expression has type
+                # "Union[ExtensionArray, ndarray[Any, Any]]", variable has type
+                # "ndarray[Any, Any]")
+                subarr = _maybe_cast_data_without_dtype(  # type: ignore[assignment]
+                    subarr, cast_numeric_deprecated=False
+                )
+                dtype = subarr.dtype
             return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
 
     @classmethod
@@ -637,6 +648,26 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT:
 
         return result
 
+    @classmethod
+    def _with_infer(cls, *args, **kwargs):
+        """
+        Constructor that uses the 1.0.x behavior inferring numeric dtypes
+        for ndarray[object] inputs.
+        """
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning)
+            result = cls(*args, **kwargs)
+
+        if result.dtype == object and not result._is_multi:
+            # error: Argument 1 to "maybe_convert_objects" has incompatible type
+            # "Union[ExtensionArray, ndarray[Any, Any]]"; expected
+            # "ndarray[Any, Any]"
+            values = lib.maybe_convert_objects(result._values)  # type: ignore[arg-type]
+            if values.dtype.kind in ["i", "u", "f"]:
+                return Index(values, name=result.name)
+
+        return result
+
     @cache_readonly
     def _constructor(self: _IndexT) -> type[_IndexT]:
         return type(self)
@@ -2609,7 +2640,7 @@ def fillna(self, value=None, downcast=None):
             if downcast is None:
                 # no need to care metadata other than name
                 # because it can't have freq if
-                return Index(result, name=self.name)
+                return Index._with_infer(result, name=self.name)
         return self._view()
 
     def dropna(self: _IndexT, how: str_t = "any") -> _IndexT:
@@ -4009,7 +4040,7 @@ def _reindex_non_unique(
         if isinstance(self, ABCMultiIndex):
             new_index = type(self).from_tuples(new_labels, names=self.names)
         else:
-            new_index = Index(new_labels, name=self.name)
+            new_index = Index._with_infer(new_labels, name=self.name)
         return new_index, indexer, new_indexer
 
     # --------------------------------------------------------------------
@@ -4450,9 +4481,12 @@ def _wrap_joined_index(self: _IndexT, joined: ArrayLike, other: _IndexT) -> _Ind
 
         if isinstance(self, ABCMultiIndex):
             name = self.names if self.names == other.names else None
+            # error: Incompatible return value type (got "MultiIndex",
+            # expected "_IndexT")
+            return self._constructor(joined, name=name)  # type: ignore[return-value]
         else:
             name = get_op_result_name(self, other)
-        return self._constructor(joined, name=name)
+            return self._constructor._with_infer(joined, name=name)
 
     # --------------------------------------------------------------------
     # Uncategorized Methods
@@ -4805,7 +4839,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
         to_concat_vals = [x._values for x in to_concat]
 
         result = concat_compat(to_concat_vals)
-        return Index(result, name=name)
+        return Index._with_infer(result, name=name)
 
     def putmask(self, mask, value) -> Index:
         """
@@ -5752,7 +5786,7 @@ def map(self, mapper, na_action=None):
         ):
             return self._constructor(new_values, **attributes)
 
-        return Index(new_values, **attributes)
+        return Index._with_infer(new_values, **attributes)
 
     # TODO: De-duplicate with map, xref GH#32349
     @final
@@ -6228,7 +6262,7 @@ def insert(self, loc: int, item) -> Index:
         # Use Index constructor to ensure we get tuples cast correctly.
         item = Index([item], dtype=self.dtype)._values
         idx = np.concatenate((arr[:loc], item, arr[loc:]))
-        return Index(idx, name=self.name)
+        return Index._with_infer(idx, name=self.name)
 
     def drop(self, labels, errors: str_t = "raise") -> Index:
         """
@@ -6313,8 +6347,8 @@ def _arith_method(self, other, op):
 
         result = op(Series(self), other)
         if isinstance(result, tuple):
-            return (Index(result[0]), Index(result[1]))
-        return Index(result)
+            return (Index._with_infer(result[0]), Index(result[1]))
+        return Index._with_infer(result)
 
     @final
     def _unary_method(self, op):
@@ -6637,7 +6671,7 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind
 
     if isinstance(index_like, ABCSeries):
         name = index_like.name
-        return Index(index_like, name=name, copy=copy)
+        return Index._with_infer(index_like, name=name, copy=copy)
 
     if is_iterator(index_like):
         index_like = list(index_like)
@@ -6653,10 +6687,9 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind
 
             return MultiIndex.from_arrays(index_like)
         else:
-            return Index(index_like, copy=copy, tupleize_cols=False)
+            return Index._with_infer(index_like, copy=copy, tupleize_cols=False)
     else:
-
-        return Index(index_like, copy=copy)
+        return Index._with_infer(index_like, copy=copy)
 
 
 def ensure_has_len(seq):
@@ -6717,14 +6750,26 @@ def maybe_extract_name(name, obj, cls) -> Hashable:
     return name
 
 
-def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike:
+_cast_depr_msg = (
+    "In a future version, passing an object-dtype arraylike to pd.Index will "
+    "not infer numeric values to numeric dtype (matching the Series behavior). "
+    "To retain the old behavior, explicitly pass the desired dtype or use the "
+    "desired Index subclass"
+)
+
+
+def _maybe_cast_data_without_dtype(
+    subarr: np.ndarray, cast_numeric_deprecated: bool = True
+) -> ArrayLike:
     """
     If we have an arraylike input but no passed dtype, try to infer
     a supported dtype.
 
     Parameters
     ----------
     subarr : np.ndarray[object]
+    cast_numeric_deprecated : bool, default True
+        Whether to issue a FutureWarning when inferring numeric dtypes.
 
     Returns
     -------
@@ -6739,6 +6784,17 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike:
         convert_interval=True,
         dtype_if_all_nat=np.dtype("datetime64[ns]"),
     )
+    if result.dtype.kind in ["i", "u", "f"]:
+        if not cast_numeric_deprecated:
+            # i.e. we started with a list, not an ndarray[object]
+            return result
+
+        warnings.warn(
+            "In a future version, the Index constructor will not infer numeric "
+            "dtypes when passed object-dtype sequences (matching Series behavior)",
+            FutureWarning,
+            stacklevel=3,
+        )
     if result.dtype.kind in ["b", "c"]:
         return subarr
     result = ensure_wrapped_if_datetimelike(result)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2150,7 +2150,7 @@ def append(self, other):
         try:
             return MultiIndex.from_tuples(new_tuples, names=self.names)
         except (TypeError, IndexError):
-            return Index(new_tuples)
+            return Index._with_infer(new_tuples)
 
     def argsort(self, *args, **kwargs) -> np.ndarray:
         return self._values.argsort(*args, **kwargs)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -322,7 +322,7 @@ def cons_row(x):
                     out = out.get_level_values(0)
                 return out
             else:
-                return Index(result, name=name)
+                return Index._with_infer(result, name=name)
         else:
             index = self._orig.index
             # This is a mess.
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -226,7 +226,7 @@ def _box_as_indexlike(
     if is_datetime64_dtype(dt_array):
         tz = "utc" if utc else None
         return DatetimeIndex(dt_array, tz=tz, name=name)
-    return Index(dt_array, name=name)
+    return Index(dt_array, name=name, dtype=dt_array.dtype)
 
 
 def _convert_and_box_cache(
@@ -517,7 +517,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index:
     """
     to_datetime specalized to the case where a 'unit' is passed.
     """
-    arg = getattr(arg, "_values", arg)
+    arg = getattr(arg, "_values", arg)  # TODO: extract_array
 
     # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
     # because it expects an ndarray argument
@@ -529,7 +529,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index:
 
     if errors == "ignore":
         # Index constructor _may_ infer to DatetimeIndex
-        result = Index(arr, name=name)
+        result = Index._with_infer(arr, name=name)
     else:
         result = DatetimeIndex(arr, name=name)
 
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
@@ -329,7 +329,9 @@ def _hash_ndarray(
             )
 
             codes, categories = factorize(vals, sort=False)
-            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
+            cat = Categorical(
+                codes, Index._with_infer(categories), ordered=False, fastpath=True
+            )
             return _hash_categorical(cat, encoding, hash_key)
 
         try:
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -88,7 +88,7 @@ def test_astype_index(all_data, dropna):
         other = all_data
 
     dtype = all_data.dtype
-    idx = pd.Index(np.array(other))
+    idx = pd.Index._with_infer(np.array(other))
     assert isinstance(idx, ABCIndex)
 
     result = idx.astype(dtype)
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
@@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
         _, uniques = pd.factorize(data_for_grouping, sort=True)
 
         if as_index:
-            index = pd.Index(uniques, name="B")
+            index = pd.Index._with_infer(uniques, name="B")
             expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
             self.assert_series_equal(result, expected)
         else:
@@ -53,7 +53,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
         result = df.groupby("B", sort=False).A.mean()
         _, index = pd.factorize(data_for_grouping, sort=False)
 
-        index = pd.Index(index, name="B")
+        index = pd.Index._with_infer(index, name="B")
         expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
         self.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
         result = getattr(df, op)()
         expected = DataFrame(
             {"value": expected_value},
-            index=Index([100, 200], dtype="object", name="ID"),
+            index=Index([100, 200], name="ID"),
         )
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -1126,7 +1126,7 @@ def test_apply_to_nullable_integer_returns_float(values, function):
     # https://github.com/pandas-dev/pandas/issues/32219
     output = 0.5 if function == "var" else 1.5
     arr = np.array([output] * 3, dtype=float)
-    idx = Index([1, 2, 3], dtype=object, name="a")
+    idx = Index([1, 2, 3], name="a")
     expected = DataFrame({"b": arr}, index=idx).astype("Float64")
 
     groups = DataFrame(values, dtype="Int64").groupby("a")
@@ -1146,7 +1146,7 @@ def test_groupby_sum_below_mincount_nullable_integer():
     # https://github.com/pandas-dev/pandas/issues/32861
     df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
     grouped = df.groupby("a")
-    idx = Index([0, 1, 2], dtype=object, name="a")
+    idx = Index([0, 1, 2], name="a")
 
     result = grouped["b"].sum(min_count=2)
     expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py
diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py

Original file line number	Diff line number	Diff line change
`@@ -529,7 +529,7 @@ def validate_categories(categories, fastpath: bool = False) -> Index:`
`529`	`529`	`f"Parameter 'categories' must be list-like, was {repr(categories)}"`
`530`	`530`	`)`
`531`	`531`	`elif not isinstance(categories, ABCIndex):`
`532`		`- categories = Index(categories, tupleize_cols=False)`
	`532`	`+ categories = Index._with_infer(categories, tupleize_cols=False)`
`533`	`533`
`534`	`534`	`if not fastpath:`
`535`	`535`
Original file line number	Diff line number	Diff line change
`@@ -329,7 +329,9 @@ def _hash_ndarray(`
`329`	`329`	`)`
`330`	`330`
`331`	`331`	`codes, categories = factorize(vals, sort=False)`
`332`		`- cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)`
	`332`	`+ cat = Categorical(`
	`333`	`+ codes, Index._with_infer(categories), ordered=False, fastpath=True`
	`334`	`+ )`
`333`	`335`	`return _hash_categorical(cat, encoding, hash_key)`
`334`	`336`
`335`	`337`	`try:`
Original file line number	Diff line number	Diff line change
`@@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):`
`1067`	`1067`	`result = getattr(df, op)()`
`1068`	`1068`	`expected = DataFrame(`
`1069`	`1069`	`{"value": expected_value},`
`1070`		`- index=Index([100, 200], dtype="object", name="ID"),`
	`1070`	`+ index=Index([100, 200], name="ID"),`
`1071`	`1071`	`)`
`1072`	`1072`	`tm.assert_frame_equal(result, expected)`
`1073`	`1073`