Skip to content

Commit 126a19d

Browse files
authored
DEPR: Index inferring numeric dtype from ndarray[object] (pandas-dev#42870)
1 parent cda37fb commit 126a19d

File tree

21 files changed

+121
-41
lines changed

21 files changed

+121
-41
lines changed

pandas/_testing/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def box_expected(expected, box_cls, transpose=True):
219219
else:
220220
expected = pd.array(expected)
221221
elif box_cls is Index:
222-
expected = Index(expected)
222+
expected = Index._with_infer(expected)
223223
elif box_cls is Series:
224224
expected = Series(expected)
225225
elif box_cls is DataFrame:

pandas/core/arrays/categorical.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -2031,7 +2031,9 @@ def _validate_listlike(self, value):
20312031
from pandas import Index
20322032

20332033
# tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
2034-
to_add = Index(value, tupleize_cols=False).difference(self.categories)
2034+
to_add = Index._with_infer(value, tupleize_cols=False).difference(
2035+
self.categories
2036+
)
20352037

20362038
# no assignments of values not in categories, but it's always ok to set
20372039
# something to np.nan
@@ -2741,6 +2743,7 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
27412743
# as values but its codes are by def [0, ..., len(n_categories) - 1]
27422744
cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
27432745
cat = Categorical.from_codes(cat_codes, dtype=values.dtype)
2746+
27442747
categories = CategoricalIndex(cat)
27452748
codes = values.codes
27462749
else:

pandas/core/arrays/interval.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616

1717
from pandas._config import get_option
1818

19-
from pandas._libs import NaT
19+
from pandas._libs import (
20+
NaT,
21+
lib,
22+
)
2023
from pandas._libs.interval import (
2124
VALID_CLOSED,
2225
Interval,
@@ -225,6 +228,9 @@ def __new__(
225228
left, right, infer_closed = intervals_to_interval_bounds(
226229
data, validate_closed=closed is None
227230
)
231+
if left.dtype == object:
232+
left = lib.maybe_convert_objects(left)
233+
right = lib.maybe_convert_objects(right)
228234
closed = closed or infer_closed
229235

230236
return cls._simple_new(

pandas/core/dtypes/dtypes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ def validate_categories(categories, fastpath: bool = False) -> Index:
529529
f"Parameter 'categories' must be list-like, was {repr(categories)}"
530530
)
531531
elif not isinstance(categories, ABCIndex):
532-
categories = Index(categories, tupleize_cols=False)
532+
categories = Index._with_infer(categories, tupleize_cols=False)
533533

534534
if not fastpath:
535535

pandas/core/groupby/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,7 @@ def _get_index() -> Index:
455455
if self.grouper.nkeys > 1:
456456
index = MultiIndex.from_tuples(keys, names=self.grouper.names)
457457
else:
458-
index = Index(keys, name=self.grouper.names[0])
458+
index = Index._with_infer(keys, name=self.grouper.names[0])
459459
return index
460460

461461
if isinstance(values[0], dict):

pandas/core/groupby/grouper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,7 @@ def group_index(self) -> Index:
646646
return self._group_index
647647

648648
uniques = self._codes_and_uniques[1]
649-
return Index(uniques, name=self.name)
649+
return Index._with_infer(uniques, name=self.name)
650650

651651
@cache_readonly
652652
def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:

pandas/core/indexes/base.py

+70-14
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,9 @@ def __new__(
471471
arr = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
472472

473473
if dtype is None:
474-
arr = _maybe_cast_data_without_dtype(arr)
474+
arr = _maybe_cast_data_without_dtype(
475+
arr, cast_numeric_deprecated=True
476+
)
475477
dtype = arr.dtype
476478

477479
if kwargs:
@@ -504,6 +506,15 @@ def __new__(
504506
# other iterable of some kind
505507

506508
subarr = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
509+
if dtype is None:
510+
# with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated
511+
# error: Incompatible types in assignment (expression has type
512+
# "Union[ExtensionArray, ndarray[Any, Any]]", variable has type
513+
# "ndarray[Any, Any]")
514+
subarr = _maybe_cast_data_without_dtype( # type: ignore[assignment]
515+
subarr, cast_numeric_deprecated=False
516+
)
517+
dtype = subarr.dtype
507518
return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
508519

509520
@classmethod
@@ -637,6 +648,26 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT:
637648

638649
return result
639650

651+
@classmethod
def _with_infer(cls, *args, **kwargs):
    """
    Build an Index the way the 1.0.x constructor did, i.e. still inferring
    numeric dtypes for ndarray[object] inputs.

    The FutureWarning emitted by the regular constructor for that case is
    silenced, and an object-dtype, non-MultiIndex result gets one more
    conversion attempt; it is re-wrapped only when that attempt yields an
    integer, unsigned-integer or float array.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning)
        inferred = cls(*args, **kwargs)

    if inferred.dtype != object or inferred._is_multi:
        # already has a specific dtype, or is a MultiIndex: nothing to infer
        return inferred

    # error: Argument 1 to "maybe_convert_objects" has incompatible type
    # "Union[ExtensionArray, ndarray[Any, Any]]"; expected
    # "ndarray[Any, Any]"
    converted = lib.maybe_convert_objects(inferred._values)  # type: ignore[arg-type]
    if converted.dtype.kind not in ["i", "u", "f"]:
        # conversion did not produce a numeric array; keep the object Index
        return inferred

    return Index(converted, name=inferred.name)
670+
640671
@cache_readonly
641672
def _constructor(self: _IndexT) -> type[_IndexT]:
642673
return type(self)
@@ -2609,7 +2640,7 @@ def fillna(self, value=None, downcast=None):
26092640
if downcast is None:
26102641
# no need to care metadata other than name
26112642
# because it can't have freq if
2612-
return Index(result, name=self.name)
2643+
return Index._with_infer(result, name=self.name)
26132644
return self._view()
26142645

26152646
def dropna(self: _IndexT, how: str_t = "any") -> _IndexT:
@@ -4009,7 +4040,7 @@ def _reindex_non_unique(
40094040
if isinstance(self, ABCMultiIndex):
40104041
new_index = type(self).from_tuples(new_labels, names=self.names)
40114042
else:
4012-
new_index = Index(new_labels, name=self.name)
4043+
new_index = Index._with_infer(new_labels, name=self.name)
40134044
return new_index, indexer, new_indexer
40144045

40154046
# --------------------------------------------------------------------
@@ -4450,9 +4481,12 @@ def _wrap_joined_index(self: _IndexT, joined: ArrayLike, other: _IndexT) -> _Ind
44504481

44514482
if isinstance(self, ABCMultiIndex):
44524483
name = self.names if self.names == other.names else None
4484+
# error: Incompatible return value type (got "MultiIndex",
4485+
# expected "_IndexT")
4486+
return self._constructor(joined, name=name) # type: ignore[return-value]
44534487
else:
44544488
name = get_op_result_name(self, other)
4455-
return self._constructor(joined, name=name)
4489+
return self._constructor._with_infer(joined, name=name)
44564490

44574491
# --------------------------------------------------------------------
44584492
# Uncategorized Methods
@@ -4805,7 +4839,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
48054839
to_concat_vals = [x._values for x in to_concat]
48064840

48074841
result = concat_compat(to_concat_vals)
4808-
return Index(result, name=name)
4842+
return Index._with_infer(result, name=name)
48094843

48104844
def putmask(self, mask, value) -> Index:
48114845
"""
@@ -5752,7 +5786,7 @@ def map(self, mapper, na_action=None):
57525786
):
57535787
return self._constructor(new_values, **attributes)
57545788

5755-
return Index(new_values, **attributes)
5789+
return Index._with_infer(new_values, **attributes)
57565790

57575791
# TODO: De-duplicate with map, xref GH#32349
57585792
@final
@@ -6228,7 +6262,7 @@ def insert(self, loc: int, item) -> Index:
62286262
# Use Index constructor to ensure we get tuples cast correctly.
62296263
item = Index([item], dtype=self.dtype)._values
62306264
idx = np.concatenate((arr[:loc], item, arr[loc:]))
6231-
return Index(idx, name=self.name)
6265+
return Index._with_infer(idx, name=self.name)
62326266

62336267
def drop(self, labels, errors: str_t = "raise") -> Index:
62346268
"""
@@ -6313,8 +6347,8 @@ def _arith_method(self, other, op):
63136347

63146348
result = op(Series(self), other)
63156349
if isinstance(result, tuple):
6316-
return (Index(result[0]), Index(result[1]))
6317-
return Index(result)
6350+
return (Index._with_infer(result[0]), Index(result[1]))
6351+
return Index._with_infer(result)
63186352

63196353
@final
63206354
def _unary_method(self, op):
@@ -6637,7 +6671,7 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind
66376671

66386672
if isinstance(index_like, ABCSeries):
66396673
name = index_like.name
6640-
return Index(index_like, name=name, copy=copy)
6674+
return Index._with_infer(index_like, name=name, copy=copy)
66416675

66426676
if is_iterator(index_like):
66436677
index_like = list(index_like)
@@ -6653,10 +6687,9 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind
66536687

66546688
return MultiIndex.from_arrays(index_like)
66556689
else:
6656-
return Index(index_like, copy=copy, tupleize_cols=False)
6690+
return Index._with_infer(index_like, copy=copy, tupleize_cols=False)
66576691
else:
6658-
6659-
return Index(index_like, copy=copy)
6692+
return Index._with_infer(index_like, copy=copy)
66606693

66616694

66626695
def ensure_has_len(seq):
@@ -6717,14 +6750,26 @@ def maybe_extract_name(name, obj, cls) -> Hashable:
67176750
return name
67186751

67196752

6720-
def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike:
6753+
_cast_depr_msg = (
6754+
"In a future version, passing an object-dtype arraylike to pd.Index will "
6755+
"not infer numeric values to numeric dtype (matching the Series behavior). "
6756+
"To retain the old behavior, explicitly pass the desired dtype or use the "
6757+
"desired Index subclass"
6758+
)
6759+
6760+
6761+
def _maybe_cast_data_without_dtype(
6762+
subarr: np.ndarray, cast_numeric_deprecated: bool = True
6763+
) -> ArrayLike:
67216764
"""
67226765
If we have an arraylike input but no passed dtype, try to infer
67236766
a supported dtype.
67246767
67256768
Parameters
67266769
----------
67276770
subarr : np.ndarray[object]
6771+
cast_numeric_deprecated : bool, default True
6772+
Whether to issue a FutureWarning when inferring numeric dtypes.
67286773
67296774
Returns
67306775
-------
@@ -6739,6 +6784,17 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike:
67396784
convert_interval=True,
67406785
dtype_if_all_nat=np.dtype("datetime64[ns]"),
67416786
)
6787+
if result.dtype.kind in ["i", "u", "f"]:
6788+
if not cast_numeric_deprecated:
6789+
# i.e. we started with a list, not an ndarray[object]
6790+
return result
6791+
6792+
warnings.warn(
6793+
"In a future version, the Index constructor will not infer numeric "
6794+
"dtypes when passed object-dtype sequences (matching Series behavior)",
6795+
FutureWarning,
6796+
stacklevel=3,
6797+
)
67426798
if result.dtype.kind in ["b", "c"]:
67436799
return subarr
67446800
result = ensure_wrapped_if_datetimelike(result)

pandas/core/indexes/multi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2150,7 +2150,7 @@ def append(self, other):
21502150
try:
21512151
return MultiIndex.from_tuples(new_tuples, names=self.names)
21522152
except (TypeError, IndexError):
2153-
return Index(new_tuples)
2153+
return Index._with_infer(new_tuples)
21542154

21552155
def argsort(self, *args, **kwargs) -> np.ndarray:
21562156
return self._values.argsort(*args, **kwargs)

pandas/core/strings/accessor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ def cons_row(x):
322322
out = out.get_level_values(0)
323323
return out
324324
else:
325-
return Index(result, name=name)
325+
return Index._with_infer(result, name=name)
326326
else:
327327
index = self._orig.index
328328
# This is a mess.

pandas/core/tools/datetimes.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def _box_as_indexlike(
226226
if is_datetime64_dtype(dt_array):
227227
tz = "utc" if utc else None
228228
return DatetimeIndex(dt_array, tz=tz, name=name)
229-
return Index(dt_array, name=name)
229+
return Index(dt_array, name=name, dtype=dt_array.dtype)
230230

231231

232232
def _convert_and_box_cache(
@@ -517,7 +517,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index:
517517
"""
518518
to_datetime specialized to the case where a 'unit' is passed.
519519
"""
520-
arg = getattr(arg, "_values", arg)
520+
arg = getattr(arg, "_values", arg) # TODO: extract_array
521521

522522
# GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
523523
# because it expects an ndarray argument
@@ -529,7 +529,7 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index:
529529

530530
if errors == "ignore":
531531
# Index constructor _may_ infer to DatetimeIndex
532-
result = Index(arr, name=name)
532+
result = Index._with_infer(arr, name=name)
533533
else:
534534
result = DatetimeIndex(arr, name=name)
535535

pandas/core/util/hashing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,9 @@ def _hash_ndarray(
329329
)
330330

331331
codes, categories = factorize(vals, sort=False)
332-
cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
332+
cat = Categorical(
333+
codes, Index._with_infer(categories), ordered=False, fastpath=True
334+
)
333335
return _hash_categorical(cat, encoding, hash_key)
334336

335337
try:

pandas/tests/arrays/integer/test_dtypes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_astype_index(all_data, dropna):
8888
other = all_data
8989

9090
dtype = all_data.dtype
91-
idx = pd.Index(np.array(other))
91+
idx = pd.Index._with_infer(np.array(other))
9292
assert isinstance(idx, ABCIndex)
9393

9494
result = idx.astype(dtype)

pandas/tests/extension/base/groupby.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
2525
_, uniques = pd.factorize(data_for_grouping, sort=True)
2626

2727
if as_index:
28-
index = pd.Index(uniques, name="B")
28+
index = pd.Index._with_infer(uniques, name="B")
2929
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
3030
self.assert_series_equal(result, expected)
3131
else:
@@ -53,7 +53,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
5353
result = df.groupby("B", sort=False).A.mean()
5454
_, index = pd.factorize(data_for_grouping, sort=False)
5555

56-
index = pd.Index(index, name="B")
56+
index = pd.Index._with_infer(index, name="B")
5757
expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
5858
self.assert_series_equal(result, expected)
5959

pandas/tests/frame/test_reductions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
10671067
result = getattr(df, op)()
10681068
expected = DataFrame(
10691069
{"value": expected_value},
1070-
index=Index([100, 200], dtype="object", name="ID"),
1070+
index=Index([100, 200], name="ID"),
10711071
)
10721072
tm.assert_frame_equal(result, expected)
10731073

pandas/tests/groupby/test_function.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1126,7 +1126,7 @@ def test_apply_to_nullable_integer_returns_float(values, function):
11261126
# https://github.com/pandas-dev/pandas/issues/32219
11271127
output = 0.5 if function == "var" else 1.5
11281128
arr = np.array([output] * 3, dtype=float)
1129-
idx = Index([1, 2, 3], dtype=object, name="a")
1129+
idx = Index([1, 2, 3], name="a")
11301130
expected = DataFrame({"b": arr}, index=idx).astype("Float64")
11311131

11321132
groups = DataFrame(values, dtype="Int64").groupby("a")
@@ -1146,7 +1146,7 @@ def test_groupby_sum_below_mincount_nullable_integer():
11461146
# https://github.com/pandas-dev/pandas/issues/32861
11471147
df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
11481148
grouped = df.groupby("a")
1149-
idx = Index([0, 1, 2], dtype=object, name="a")
1149+
idx = Index([0, 1, 2], name="a")
11501150

11511151
result = grouped["b"].sum(min_count=2)
11521152
expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")

0 commit comments

Comments
 (0)