From 386c421a5b8ebaef199bbd87df8bc161e4d0e84e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 15:10:15 +0100 Subject: [PATCH 01/10] API: ignore empty range/object dtype in Index setop operations (string dtype compat) --- pandas/core/indexes/base.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e2f9c5e9868a9..d7159cd7eb16d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -19,7 +19,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( NaT, @@ -6233,8 +6236,16 @@ def _find_common_type_compat(self, target) -> DtypeObj: Implementation of find_common_type that adjusts for Index-specific special cases. """ + # breakpoint() target_dtype, _ = infer_dtype_from(target) + if using_string_dtype(): + from pandas.core.indexes.range import RangeIndex + + if len(self) == 0 or self.isna().all(): + if isinstance(self, RangeIndex) or self.dtype == np.object_: + return target_dtype + # special case: if one dtype is uint64 and the other a signed int, return object # See https://github.com/pandas-dev/pandas/issues/26778 for discussion # Now it's: From e809ac2802012f433d9633ef9e99391b6b6de367 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 19:28:12 +0100 Subject: [PATCH 02/10] only for empty + fix tests --- pandas/core/indexes/base.py | 15 ++++++++--- pandas/tests/frame/indexing/test_coercion.py | 7 +---- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_insert.py | 3 +-- pandas/tests/frame/indexing/test_setitem.py | 12 +++------ pandas/tests/frame/methods/test_dropna.py | 3 --- .../tests/frame/methods/test_reset_index.py | 3 --- pandas/tests/groupby/test_groupby.py | 2 +- .../tests/indexes/base_class/test_setops.py | 4 +-- pandas/tests/indexes/test_setops.py | 4 +++ pandas/tests/indexing/test_at.py | 7 +---- pandas/tests/indexing/test_loc.py | 27 +++++++++---------- pandas/tests/indexing/test_partial.py | 18 +++++-------- pandas/tests/series/indexing/test_indexing.py | 14 +++------- .../tests/series/indexing/test_set_value.py | 8 ++++-- pandas/tests/series/indexing/test_setitem.py | 13 +++++---- 16 files changed, 60 insertions(+), 82 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d7159cd7eb16d..167c03cc49596 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6236,15 +6236,22 @@ def _find_common_type_compat(self, target) -> DtypeObj: Implementation of find_common_type that adjusts for Index-specific special cases. """ - # breakpoint() target_dtype, _ = infer_dtype_from(target) if using_string_dtype(): + # special case: if left or right is a zero-length RangeIndex or + # Index[object], those can be created by the default empty constructors + # -> for that case ignore this dtype and always return the other from pandas.core.indexes.range import RangeIndex - if len(self) == 0 or self.isna().all(): - if isinstance(self, RangeIndex) or self.dtype == np.object_: - return target_dtype + if len(self) == 0 and ( + isinstance(self, RangeIndex) or self.dtype == np.object_ + ): + return target_dtype + if len(target) == 0 and ( + isinstance(target, RangeIndex) or target_dtype == np.object_ + ): + return self.dtype # special case: if one dtype is uint64 and the other a signed int, return object # See https://github.com/pandas-dev/pandas/issues/26778 for discussion diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index 1a454351b7085..472bfb7772a80 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -88,12 +88,7 @@ def test_26395(indexer_al): df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame( - {"D": [0, 0, 2]}, - index=["A", "B", "C"], - columns=pd.Index(["D"], dtype=object), - dtype=np.int64, - ) + expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) tm.assert_frame_equal(df, expected) with pytest.raises(TypeError, match="Invalid value"): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a9bc485283985..0c99b08cb30c4 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=Index(list("ABCDEFGH"), dtype=object), + index=list("ABCDEFGH"), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index a1d60eb9626d6..b530cb98ef46c 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], - columns=Index(["A", "A", "A"], dtype=object), + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cfd7e91c4ceab..d0f2eeae62320 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -150,9 +150,7 @@ def test_setitem_empty_columns(self): df["X"] = df.index df["X"] = ["x", "y", "z"] exp = DataFrame( - data={"X": ["x", "y", "z"]}, - index=["A", "B", "C"], - columns=Index(["X"], dtype=object), + data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"] ) tm.assert_frame_equal(df, exp) @@ -169,9 +167,7 @@ def test_setitem_timestamp_empty_columns(self): df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, - index=range(3), - columns=Index(["now"], dtype=object), + [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"] ) tm.assert_frame_equal(df, expected) @@ -210,7 +206,7 @@ def test_setitem_period_preserves_dtype(self): result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) + expected = DataFrame({"a": data}, columns=["a"]) tm.assert_frame_equal(result, expected) @@ -930,7 +926,7 @@ def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) + expected = DataFrame(columns=["foo"]).astype(np.int64) tm.assert_frame_equal(df, expected) def test_setitem_newcol_tuple_key(self, float_frame): diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 4a60dc09cfe07..11893d7fac1a4 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -184,7 +182,6 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 0b320075ed2d2..80da849cc59d4 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5bae9b1fd9882..4ff3db6fe089e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 0e9fb77d6e8dd..4038e3b136ceb 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -240,7 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): - expected_dtype = object if not first_list or not second_list else "str" + # expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -251,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name, dtype=expected_dtype) + expected = Index(vals, name=expected_name) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 58b69d79c65ce..436c862db310e 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -539,10 +539,14 @@ def test_setop_with_categorical(index_flat, sort, method): result = getattr(index, method)(other, sort=sort) expected = getattr(index, method)(index, sort=sort) + if index.empty and method in ("union", "symmetric_difference"): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) + if index.empty and method in ("union", "symmetric_difference"): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 10a8fa88b4b5e..e80acc230a320 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -13,7 +13,6 @@ CategoricalIndex, DataFrame, DatetimeIndex, - Index, MultiIndex, Series, Timestamp, @@ -67,11 +66,7 @@ def test_at_setitem_item_cache_cleared(self): df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame( - {"x": [4], "cost": 789}, - index=[0], - columns=Index(["x", "cost"], dtype=object), - ) + expected = DataFrame({"x": [4], "cost": 789}, index=[0]) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7aeded5a6cb7f..7c8e6026ad27c 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -766,9 +766,9 @@ def test_loc_setitem_empty_frame(self): # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame( - {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) - ).reindex(index=index) + expected = DataFrame({"A": sera, "B": serb}, columns=Index(["A", "B"])).reindex( + index=index + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -966,7 +966,7 @@ def test_setitem_new_key_tz(self, indexer_sl): to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) + expected = Series(vals, index=Index(["foo", "bar"])) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1966,15 +1966,11 @@ def test_loc_setitem_empty_series_str_idx(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"]))) ser.loc["bar"] = 3 - tm.assert_series_equal( - ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3], index=Index(["foo", "bar"]))) ser.loc[3] = 4 - tm.assert_series_equal( - ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3, 4], index=Index(["foo", "bar", 3]))) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -1996,7 +1992,7 @@ def test_loc_setitem_incremental_with_dst(self): ], ids=["self", "to_datetime64", "to_pydatetime", "np.datetime64"], ) - def test_loc_setitem_datetime_keys_cast(self, conv): + def test_loc_setitem_datetime_keys_cast(self, conv, using_infer_string): # GH#9516, GH#51363 changed in 3.0 to not cast on Index.insert dt1 = Timestamp("20130101 09:00:00") dt2 = Timestamp("20130101 10:00:00") @@ -2004,10 +2000,13 @@ def test_loc_setitem_datetime_keys_cast(self, conv): df.loc[conv(dt1), "one"] = 100 df.loc[conv(dt2), "one"] = 200 + # breakpoint() expected = DataFrame( {"one": [100.0, 200.0]}, - index=Index([conv(dt1), conv(dt2)], dtype=object), - columns=Index(["one"], dtype=object), + index=Index( + [conv(dt1), conv(dt2)], dtype=None if using_infer_string else object + ), + columns=Index(["one"]), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 3dbdedbb94618..6f20d0e4e7cbf 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -30,7 +30,7 @@ def test_empty_frame_setitem_index_name_retained(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -43,7 +43,7 @@ def test_empty_frame_setitem_index_name_inherited(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -96,9 +96,7 @@ def test_partial_set_empty_frame2(self): # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="object") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="object")) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -116,9 +114,7 @@ def test_partial_set_empty_frame2(self): tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -135,9 +131,7 @@ def test_partial_set_empty_frame4(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -210,7 +204,7 @@ def test_partial_set_empty_frame_empty_copy_assignment(self): df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) + expected = DataFrame(0, index=[0], columns=Index(["a"])) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d3556b644c4bf..02efd850f9640 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -253,25 +253,17 @@ def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - expected = Series( - Timedelta("1 days"), dtype="timedelta64[ns]", index=Index(["B"], dtype=object) - ) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["B"]) tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) expected = Series( - [np.nan, Timedelta("1 days")], - dtype="timedelta64[ns]", - index=Index(["A", "B"], dtype=object), + [np.nan, Timedelta("1 days")], dtype="timedelta64[ns]", index=["A", "B"] ) tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series( - Timedelta("1 days"), - dtype="timedelta64[ns]", - index=Index(["A", "B"], dtype=object), - ) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["A", "B"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_set_value.py b/pandas/tests/series/indexing/test_set_value.py index 99e71fa4b804b..3bf89da53d923 100644 --- a/pandas/tests/series/indexing/test_set_value.py +++ b/pandas/tests/series/indexing/test_set_value.py @@ -3,17 +3,21 @@ import numpy as np from pandas import ( + DatetimeIndex, Index, Series, ) import pandas._testing as tm -def test_series_set_value(): +def test_series_set_value(using_infer_string): # GH#1561, GH#51363 as of 3.0 we do not do inference in Index.insert dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = Index(dates, dtype=object) + if using_infer_string: + index = DatetimeIndex(dates) + else: + index = Index(dates, dtype=object) s = Series(dtype=object) s._set_value(dates[0], 1.0) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 49c933c308235..964f0b90b3c41 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -474,12 +474,14 @@ def test_setitem_callable_other(self): class TestSetitemWithExpansion: - def test_setitem_empty_series(self): - # GH#10193, GH#51363 changed in 3.0 to not do inference in Index.insert + def test_setitem_empty_series(self, using_infer_string): + # GH#10193 key = Timestamp("2012-01-01") series = Series(dtype=object) series[key] = 47 - expected = Series(47, Index([key], dtype=object)) + expected = Series( + 47, index=[key] if using_infer_string else Index([key], dtype=object) + ) tm.assert_series_equal(series, expected) def test_setitem_empty_series_datetimeindex_preserves_freq(self): @@ -536,10 +538,7 @@ def test_setitem_with_expansion_type_promotion(self): ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series( - [Timestamp("2016-01-01"), 3.0, "foo"], - index=Index(["a", "b", "c"], dtype=object), - ) + expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) def test_setitem_not_contained(self, string_series): From 249a42f82cbcc745aef677ba5b5482259f74558d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 20:39:38 +0100 Subject: [PATCH 03/10] fixup --- pandas/tests/indexes/test_setops.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 436c862db310e..7cc74f4b3405c 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -530,7 +530,7 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) -def test_setop_with_categorical(index_flat, sort, method): +def test_setop_with_categorical(index_flat, sort, method, using_infer_string): # MultiIndex tested separately in tests.indexes.multi.test_setops index = index_flat @@ -539,13 +539,21 @@ def test_setop_with_categorical(index_flat, sort, method): result = getattr(index, method)(other, sort=sort) expected = getattr(index, method)(index, sort=sort) - if index.empty and method in ("union", "symmetric_difference"): + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) - if index.empty and method in ("union", "symmetric_difference"): + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) From 513f343e2b5f9d7d176b7c8d1b8c76a1d4335a6d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 20:44:41 +0100 Subject: [PATCH 04/10] fix insert --- pandas/core/indexes/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 167c03cc49596..f4411a20baaa7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6248,8 +6248,10 @@ def _find_common_type_compat(self, target) -> DtypeObj: isinstance(self, RangeIndex) or self.dtype == np.object_ ): return target_dtype - if len(target) == 0 and ( - isinstance(target, RangeIndex) or target_dtype == np.object_ + if ( + isinstance(target, Index) + and len(target) == 0 + and (isinstance(target, RangeIndex) or target_dtype == np.object_) ): return self.dtype From 33ef0d561a1efd078618625b5d75f3535521528b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 20:47:37 +0100 Subject: [PATCH 05/10] more test fixes --- pandas/tests/frame/test_query_eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index ca572b1026526..375b9b00a4988 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) - expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) From 62cd9b7498a985addba6c8dabd4655707b6551ad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 21:22:07 +0100 Subject: [PATCH 06/10] more fixes for infer_string mode --- pandas/tests/dtypes/test_concat.py | 6 ++++-- pandas/tests/frame/constructors/test_from_dict.py | 3 --- pandas/tests/frame/test_constructors.py | 3 --- pandas/tests/indexes/base_class/test_setops.py | 1 - pandas/tests/indexes/datetimes/test_join.py | 10 +++++++--- pandas/tests/indexing/test_loc.py | 1 - pandas/tests/series/methods/test_combine_first.py | 5 +++-- 7 files changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index d4fe6c5264007..571e12d0c3303 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -47,7 +47,7 @@ def test_concat_periodarray_2d(): _concat.concat_compat([arr[:2], arr[2:]], axis=1) -def test_concat_series_between_empty_and_tzaware_series(): +def test_concat_series_between_empty_and_tzaware_series(using_infer_string): tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00") ser1 = Series(index=[tzaware_time], data=0, dtype=float) ser2 = Series(dtype=float) @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series(): data=[ (0.0, None), ], - index=pd.Index([tzaware_time], dtype=object), + index=[tzaware_time] + if using_infer_string + else pd.Index([tzaware_time], dtype=object), columns=[0, 1], dtype=float, ) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 1509c47ba65c7..845174bbf600e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -44,7 +42,6 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 9b6080603f0c9..037a2ae294bb2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,8 +21,6 @@ from numpy.ma import mrecords import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 4038e3b136ceb..d57df82b2358c 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): - # expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index d0ac32939296c..abf6809d67f9c 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz is timezone.utc - def test_datetimeindex_union_join_empty(self, sort): + def test_datetimeindex_union_join_empty(self, sort, using_infer_string): dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) result = dti.union(empty, sort=sort) - expected = dti.astype("O") - tm.assert_index_equal(result, expected) + if using_infer_string: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) + else: + expected = dti.astype("O") + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7c8e6026ad27c..17e610bda93e4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2000,7 +2000,6 @@ def test_loc_setitem_datetime_keys_cast(self, conv, using_infer_string): df.loc[conv(dt1), "one"] = 100 df.loc[conv(dt2), "one"] = 200 - # breakpoint() expected = DataFrame( {"one": [100.0, 200.0]}, index=Index( diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 293919173c2d5..51d6704e1905b 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -31,7 +31,7 @@ def test_combine_first_name(self, datetime_series): result = datetime_series.combine_first(datetime_series[:5]) assert result.name == datetime_series.name - def test_combine_first(self): + def test_combine_first(self, using_infer_string): values = np.arange(20, dtype=np.float64) series = Series(values, index=np.arange(20, dtype=np.int64)) @@ -64,7 +64,8 @@ def test_combine_first(self): ser = Series([1.0, 2, 3], index=[0, 1, 2]) empty = Series([], index=[], dtype=object) result = ser.combine_first(empty) - ser.index = ser.index.astype("O") + if not using_infer_string: + ser.index = ser.index.astype("O") tm.assert_series_equal(result, ser.astype(object)) def test_combine_first_dt64(self, unit): From 152c4ec0859a8acb1e43aaf7b5dea5549bf35715 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 22:24:44 +0100 Subject: [PATCH 07/10] fix feather test for pyarrow<19 --- pandas/tests/io/test_feather.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 24af0a014dd50..e778193c147c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -143,8 +143,8 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) From 33f7b5ca407044cf9414ae2a8a26006f30184aa2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 29 Jan 2025 09:29:53 +0100 Subject: [PATCH 08/10] add explicit tests and fix insert for empty object dtype --- pandas/core/indexes/base.py | 9 ++++++ pandas/tests/frame/indexing/test_setitem.py | 20 ++++++++++-- .../tests/frame/methods/test_reset_index.py | 31 +++++++++++++++++++ .../tests/indexes/base_class/test_reshape.py | 2 +- pandas/tests/indexes/test_old_base.py | 10 +++--- 5 files changed, 65 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f4411a20baaa7..06aad7c4e062d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6242,6 +6242,7 @@ def _find_common_type_compat(self, target) -> DtypeObj: # special case: if left or right is a zero-length RangeIndex or # Index[object], those can be created by the default empty constructors # -> for that case ignore this dtype and always return the other + # (https://github.com/pandas-dev/pandas/pull/60797) from pandas.core.indexes.range import RangeIndex if len(self) == 0 and ( @@ -6908,6 +6909,14 @@ def insert(self, loc: int, item) -> Index: arr = self._values + if using_string_dtype and len(self) == 0 and self.dtype == np.object_: + # special case: if we are an empty object-dtype Index, also + # take into account the inserted item for the resulting dtype + # (https://github.com/pandas-dev/pandas/pull/60797) + dtype = self._find_common_type_compat(item) + if dtype != self.dtype: + return self.astype(dtype).insert(loc, item) + try: if isinstance(arr, ExtensionArray): res_values = arr.insert(loc, item) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d0f2eeae62320..20dd7b0c4d3e7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -144,8 +144,9 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - def test_setitem_empty_columns(self): - # GH 13522 + def test_setitem_overwrite_index(self): + # GH 13522 - assign the index as a column and then overwrite the values + # -> should not affect the index df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] @@ -154,6 +155,21 @@ def test_setitem_empty_columns(self): ) tm.assert_frame_equal(df, exp) + def test_setitem_empty_columns(self): + # Starting from an empty DataFrame and setting a column should result + # in a default string dtype for the columns' Index + # https://github.com/pandas-dev/pandas/issues/60338 + + df = DataFrame() + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=Index([])) + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + def test_setitem_dt64_index_empty_columns(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") df = DataFrame(index=np.arange(len(rng))) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 80da849cc59d4..80227c0462329 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -778,3 +778,34 @@ def test_reset_index_false_index_name(): result_frame.reset_index() expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) + + +@pytest.mark.parametrize("columns", [None, Index([])]) +def test_reset_index_with_empty_frame(columns): + # Currently empty DataFrame has RangeIndex or object dtype Index, but when + # resetting the index we still want to end up with the default string dtype + # https://github.com/pandas-dev/pandas/issues/60338 + + index = Index([], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo"]) + tm.assert_frame_equal(result, expected) + + index = Index([1, 2, 3], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2], "bar": [2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 56cdca49cb2b0..d4932712de1bb 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -34,7 +34,7 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 65feb07e05d9f..49609d28ca56e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -454,10 +454,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string): else: msg = "slice indices must be integers or None or have an __index__ method" - if using_infer_string and ( - index.dtype == "string" or index.dtype == "category" - ): - msg = "loc must be an integer between" + if using_infer_string: + if index.dtype == "string" or index.dtype == "category": + msg = "loc must be an integer between" + elif index.dtype == "object" and len(index) == 0: + msg = "loc must be an integer between" + err = TypeError with pytest.raises(err, match=msg): index.insert(0.5, "foo") From 46f9fe7a22e1e1d98617f2f3a8d19877cb3cda8d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 29 Jan 2025 20:46:11 +0100 Subject: [PATCH 09/10] fixup --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 06aad7c4e062d..c17c8c1d9a172 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6909,7 +6909,7 @@ def insert(self, loc: int, item) -> Index: arr = self._values - if using_string_dtype and len(self) == 0 and self.dtype == np.object_: + if using_string_dtype() and len(self) == 0 and self.dtype == np.object_: # special case: if we are an empty object-dtype Index, also # take into account the inserted item for the resulting dtype # (https://github.com/pandas-dev/pandas/pull/60797) From 73f56608b448e1618a16a9a1418f2d9e7d08bbee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 15 Feb 2025 17:35:57 +0100 Subject: [PATCH 10/10] add whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 10 ++++++++++ doc/source/whatsnew/v3.0.0.rst | 3 +++ 2 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8bdddb5b7f85d..32d9253326277 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -53,6 +53,16 @@ These are bug fixes that might have notable behavior changes. notable_bug_fix1 ^^^^^^^^^^^^^^^^ +.. _whatsnew_230.api_changes: + +API changes +~~~~~~~~~~~ + +- When enabling the ``future.infer_string`` option: Index set operations (like + union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or + empty ``Index`` with object dtype when determining the dtype of the resulting + Index (:issue:`60797`) + .. --------------------------------------------------------------------------- .. _whatsnew_230.deprecations: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4d9a45abe17cd..64e4a30453366 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -361,6 +361,9 @@ Other API changes - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) - when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtypes. (:issue:`57386`) +- Index set operations (like union or intersection) will now ignore the dtype of + an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining + the dtype of the resulting Index (:issue:`60797`) .. --------------------------------------------------------------------------- .. _whatsnew_300.deprecations: