From 9ef9f9e85bf9ee5ed2575f9840701cc6ebf3d2fe Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 18 Apr 2020 15:41:51 +0900 Subject: [PATCH 1/8] BUG: pd.Series.replace does not preserve the original dtype --- pandas/core/internals/blocks.py | 11 +++++++++-- pandas/tests/series/methods/test_replace.py | 18 +++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ebdc331a673ab..4ad96144e7a47 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1005,8 +1005,15 @@ def coerce_to_target_dtype(self, other): we can also safely try to coerce to the same dtype and will receive the same block """ - # if we cannot then coerce to object - dtype, _ = infer_dtype_from(other, pandas_dtype=True) + if type(other) == str: + dtype = "string" + if is_dtype_equal(self.dtype, dtype): + return self + else: + dtype = np.object_ + else: + # if we cannot then coerce to object + dtype, _ = infer_dtype_from(other, pandas_dtype=True) if is_dtype_equal(self.dtype, dtype): return self diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 1c54e2b988219..11e414f3cbc8d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -254,7 +254,7 @@ def test_replace2(self): def test_replace_with_dictlike_and_string_dtype(self): # GH 32621 s = pd.Series(["one", "two", np.nan], dtype="string") - expected = pd.Series(["1", "2", np.nan]) + expected = pd.Series(["1", "2", np.nan], dtype="string") result = s.replace({"one": "1", "two": "2"}) tm.assert_series_equal(expected, result) @@ -406,3 +406,19 @@ def test_replace_only_one_dictlike_arg(self): msg = "Series.replace cannot use dict-value and non-None to_replace" with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) + + @pytest.mark.parametrize( + "series, to_replace, expected", + [ + ( + pd.Series(["one", "two"], dtype="string"), + {"one": "1", "two": "2"}, + "string", + ), + (pd.Series([1, 2], dtype="int64"), {1: 10, 2: 20}, "int64"), + (pd.Series([True, False], dtype="bool"), {True: False}, "bool"), + ], + ) + def test_replace_dtype(self, series, to_replace, expected): + result = str(series.replace(to_replace).dtype) + assert expected == result From 8be0127fed99fe53c1f7a4863ba882382957bd4a Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sun, 19 Apr 2020 01:35:06 +0900 Subject: [PATCH 2/8] Changes according to comments --- pandas/core/internals/blocks.py | 2 +- pandas/tests/series/methods/test_replace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4ad96144e7a47..3090b3770b52f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1005,7 +1005,7 @@ def coerce_to_target_dtype(self, other): we can also safely try to coerce to the same dtype and will receive the same block """ - if type(other) == str: + if isinstance(other, str): dtype = "string" if is_dtype_equal(self.dtype, dtype): return self diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 11e414f3cbc8d..341160f51fdf1 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -420,5 +420,6 @@ def test_replace_only_one_dictlike_arg(self): ], ) def test_replace_dtype(self, series, to_replace, expected): + # GH 33484 result = str(series.replace(to_replace).dtype) assert expected == result From 078aca119065f5e87fb1055036e1f8a176cd99ab Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Wed, 22 Apr 2020 21:57:39 +0900 Subject: [PATCH 3/8] Remove hard-code --- pandas/core/internals/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3090b3770b52f..6967d51d44835 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -45,6 +45,7 @@ is_re, is_re_compilable, is_sparse, + is_string_dtype, is_timedelta64_dtype, pandas_dtype, ) @@ -1006,8 +1007,7 @@ def coerce_to_target_dtype(self, other): and will receive the same block """ if isinstance(other, str): - dtype = "string" - if is_dtype_equal(self.dtype, dtype): + if is_string_dtype(self.dtype): return self else: dtype = np.object_ From 571ab8ac150d45a724bf39f1275e23659e4387ce Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Thu, 23 Apr 2020 21:36:26 +0900 Subject: [PATCH 4/8] Update infer_dtype_from --- pandas/core/dtypes/cast.py | 4 +++- pandas/core/internals/blocks.py | 10 ++-------- pandas/tests/series/methods/test_replace.py | 17 +++++++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7dda6850ba4f7..1915846036be4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -684,7 +684,9 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False): # instead of np.empty (but then you still don't want things # coming out as np.str_! - dtype = np.object_ + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() elif isinstance(val, (np.datetime64, datetime)): val = tslibs.Timestamp(val) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6967d51d44835..685b5b41fcced 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1006,14 +1006,8 @@ def coerce_to_target_dtype(self, other): we can also safely try to coerce to the same dtype and will receive the same block """ - if isinstance(other, str): - if is_string_dtype(self.dtype): - return self - else: - dtype = np.object_ - else: - # if we cannot then coerce to object - dtype, _ = infer_dtype_from(other, pandas_dtype=True) + # if we cannot then coerce to object + dtype, _ = infer_dtype_from(other, pandas_dtype=True) if is_dtype_equal(self.dtype, dtype): return self diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 341160f51fdf1..6a7a09aa852e4 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -278,7 +278,7 @@ def test_replace_replacer_equals_replacement(self): # GH 20656 # make sure all replacers are matching against original values s = pd.Series(["a", "b"]) - expected = pd.Series(["b", "a"]) + expected = pd.Series(["b", "a"], dtype="string") result = s.replace({"a": "b", "b": "a"}) tm.assert_series_equal(expected, result) @@ -350,19 +350,24 @@ def test_replace_with_no_overflowerror(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "ser, to_replace, exp", + "ser, to_replace, exp, dtype", [ - ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]), - (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]), + ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4], "int64"), + ( + ["1", "2", "3"], + {"1": "2", "2": "3", "3": "4"}, + ["2", "3", "4"], + "string", + ), ], ) - def test_replace_commutative(self, ser, to_replace, exp): + def test_replace_commutative(self, ser, to_replace, exp, dtype): # GH 16051 # DataFrame.replace() overwrites when values are non-numeric series = pd.Series(ser) - expected = pd.Series(exp) + expected = pd.Series(exp, dtype=dtype) result = series.replace(to_replace) tm.assert_series_equal(result, expected) From b227024839e2e416c1354da6f6d4dc132b941b72 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Fri, 1 May 2020 23:49:14 +0900 Subject: [PATCH 5/8] =?UTF-8?q?Move=20conditional=20branch=E3=80=80out=20o?= =?UTF-8?q?f=20infer=5Fdtype=5Ffrom()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/core/dtypes/cast.py | 4 +--- pandas/core/internals/blocks.py | 5 ++++- pandas/tests/series/methods/test_replace.py | 17 ++++++----------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 393d0764d3d3d..e50d635a1ba6c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -660,9 +660,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, # instead of np.empty (but then you still don't want things # coming out as np.str_! - from pandas.core.arrays.string_ import StringDtype - - dtype = StringDtype() + dtype = np.dtype(object) elif isinstance(val, (np.datetime64, datetime)): val = tslibs.Timestamp(val) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c44cad6792788..b7952037c3dd2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -45,7 +45,6 @@ is_re, is_re_compilable, is_sparse, - is_string_dtype, is_timedelta64_dtype, pandas_dtype, ) @@ -73,6 +72,7 @@ PandasDtype, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -1008,6 +1008,9 @@ def coerce_to_target_dtype(self, other): # if we cannot then coerce to object dtype, _ = infer_dtype_from(other, pandas_dtype=True) + if isinstance(self.dtype, StringDtype): + dtype = StringDtype() + if is_dtype_equal(self.dtype, dtype): return self diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index a092bab8e2d11..d7c8e372320ff 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -274,7 +274,7 @@ def test_replace_replacer_equals_replacement(self): # GH 20656 # make sure all replacers are matching against original values s = pd.Series(["a", "b"]) - expected = pd.Series(["b", "a"], dtype="string") + expected = pd.Series(["b", "a"]) result = s.replace({"a": "b", "b": "a"}) tm.assert_series_equal(expected, result) @@ -346,24 +346,19 @@ def test_replace_with_no_overflowerror(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "ser, to_replace, exp, dtype", + "ser, to_replace, exp", [ - ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4], "int64"), - ( - ["1", "2", "3"], - {"1": "2", "2": "3", "3": "4"}, - ["2", "3", "4"], - "string", - ), + ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]), + (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]), ], ) - def test_replace_commutative(self, ser, to_replace, exp, dtype): + def test_replace_commutative(self, ser, to_replace, exp): # GH 16051 # DataFrame.replace() overwrites when values are non-numeric series = pd.Series(ser) - expected = pd.Series(exp, dtype=dtype) + expected = pd.Series(exp) result = series.replace(to_replace) tm.assert_series_equal(result, expected) From a62c8969279e83c8bb7c7ab5c457a7357d83fb21 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sun, 3 May 2020 13:11:17 +0900 Subject: [PATCH 6/8] Remove specifically handling --- pandas/core/internals/blocks.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b7952037c3dd2..948264345f82e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -72,7 +72,6 @@ PandasDtype, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array @@ -1008,13 +1007,17 @@ def coerce_to_target_dtype(self, other): # if we cannot then coerce to object dtype, _ = infer_dtype_from(other, pandas_dtype=True) - if isinstance(self.dtype, StringDtype): - dtype = StringDtype() - if is_dtype_equal(self.dtype, dtype): return self - if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): + if ( + is_extension_array_dtype(self.dtype) + and not is_categorical_dtype(self.dtype) + and not self.is_datetime + ): + dtype = self.dtype + + elif self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): # we don't upcast to bool return self.astype(object) From 1a9fa40b75ac66346978ca13f666f90ef63f9c74 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 30 May 2020 17:22:01 +0900 Subject: [PATCH 7/8] add test cases --- pandas/tests/series/methods/test_replace.py | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index d7c8e372320ff..3c34971909f2d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -413,6 +413,38 @@ def test_replace_only_one_dictlike_arg(self): ), (pd.Series([1, 2], dtype="int64"), {1: 10, 2: 20}, "int64"), (pd.Series([True, False], dtype="bool"), {True: False}, "bool"), + ( + pd.Series(pd.interval_range(0, 5), dtype=pd.IntervalDtype("int64")), + {pd.Interval(0, 1): pd.Interval(10, 20)}, + "interval[int64]", + ), + ( + pd.Series(pd.interval_range(0, 1), dtype=pd.IntervalDtype("float64")), + {pd.Interval(0.0, 1.0): pd.Interval(0.2, 0.3)}, + "interval[float64]", + ), + ( + pd.Series([pd.Period("2020-05", freq="M")], dtype=pd.PeriodDtype("M")), + {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, + "period[M]", + ), + ( + pd.Series( + pd.arrays.DatetimeArray( + np.array( + ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], + dtype="M8[ns]", + ), + dtype=pd.DatetimeTZDtype(tz="US/Central"), + ) + ), + { + pd.Timestamp( + "2000-01-01 06:00:00-0600", tz="US/Central" + ): pd.Timestamp("2000-01-01 12:00:00-0600", tz="US/Central") + }, + "datetime64[ns, US/Central]", + ), ], ) def test_replace_dtype(self, series, to_replace, expected): From b777345de286557e660903f050888b699e80053e Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 30 May 2020 17:25:08 +0900 Subject: [PATCH 8/8] move is_categorical_dtype() --- pandas/core/internals/blocks.py | 8 ++++---- pandas/tests/series/methods/test_replace.py | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 948264345f82e..ad2599f765380 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1010,10 +1010,8 @@ def coerce_to_target_dtype(self, other): if is_dtype_equal(self.dtype, dtype): return self - if ( - is_extension_array_dtype(self.dtype) - and not is_categorical_dtype(self.dtype) - and not self.is_datetime + if is_extension_array_dtype(self.dtype) and not is_categorical_dtype( + self.dtype ): dtype = self.dtype @@ -1060,6 +1058,8 @@ def coerce_to_target_dtype(self, other): raise AssertionError( f"possible recursion in coerce_to_target_dtype: {self} {other}" ) + if is_categorical_dtype(dtype) or self.is_datetime: + return self.astype(object) try: return self.astype(dtype) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 3c34971909f2d..de1674d8a9ce6 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,6 +3,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import IntervalArray class TestSeriesReplace: @@ -414,13 +415,13 @@ def test_replace_only_one_dictlike_arg(self): (pd.Series([1, 2], dtype="int64"), {1: 10, 2: 20}, "int64"), (pd.Series([True, False], dtype="bool"), {True: False}, "bool"), ( - pd.Series(pd.interval_range(0, 5), dtype=pd.IntervalDtype("int64")), - {pd.Interval(0, 1): pd.Interval(10, 20)}, + pd.Series(IntervalArray.from_breaks([1, 2, 3, 4]), dtype=pd.IntervalDtype("int64")), + {pd.Interval(1, 2): pd.Interval(10, 20)}, "interval[int64]", ), ( - pd.Series(pd.interval_range(0, 1), dtype=pd.IntervalDtype("float64")), - {pd.Interval(0.0, 1.0): pd.Interval(0.2, 0.3)}, + pd.Series(IntervalArray.from_breaks([1, 2, 3, 4]), dtype=pd.IntervalDtype("float64")), + {pd.Interval(1, 2): pd.Interval(0.2, 0.3)}, "interval[float64]", ), ( @@ -449,5 +450,5 @@ def test_replace_only_one_dictlike_arg(self): ) def test_replace_dtype(self, series, to_replace, expected): # GH 33484 - result = str(series.replace(to_replace).dtype) + result = series.replace(to_replace).dtype assert expected == result