Fillna benchmarks and Series.fillna handling of object dtype (GH#40498).

asv_bench: the DataFrame Fillna benchmark gains a value-based case
(method=None) plus int32/int64 dtypes, and a matching Series Fillna
benchmark is added.
pandas/core/generic.py: for a 1-D object-dtype caller given a dict/Series
value, the fill values start as a copy of the caller and only the labels
shared with the mapping are overwritten; other dtypes reindex the mapping.
tests: replacing one null with another, and leaving unspecified nulls alone.

Review note: the numeric DataFrame benchmark takes columns of the (N, M)
array (data[:, i]); indexing with data[i] would select rows instead.

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index e4c4122f9ff43..21b82c8302eb9 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -374,12 +374,14 @@ class Fillna:
 
     params = (
         [True, False],
-        ["pad", "bfill"],
+        [None, "pad", "bfill"],
         [
-            "float64",
-            "float32",
             "object",
+            "int32",
+            "int64",
             "Int64",
+            "float32",
+            "float64",
             "Float64",
             "datetime64[ns]",
             "datetime64[ns, tz]",
@@ -399,16 +401,26 @@ def setup(self, inplace, method, dtype):
                 "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
             }
             self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)})
+            self.value = (
+                dict(zip(self.df.columns, self.df.sample().iloc[0]))
+                if not method
+                else None
+            )
             self.df[::2] = None
         else:
-            values = np.random.randn(N, M)
-            values[::2] = np.nan
-            if dtype == "Int64":
-                values = values.round()
-            self.df = DataFrame(values, dtype=dtype)
+            data = np.random.randn(N, M)
+            if dtype in ["int32", "int64", "Int64"]:
+                data = data.round()
+            self.df = DataFrame({f"col_{i}": data[:, i] for i in range(M)}, dtype=dtype)
+            self.value = (
+                dict(zip(self.df.columns, self.df.sample().iloc[0]))
+                if not method
+                else None
+            )
+            self.df[::2] = None
 
     def time_frame_fillna(self, inplace, method, dtype):
-        self.df.fillna(inplace=inplace, method=method)
+        self.df.fillna(value=self.value, inplace=inplace, method=method)
 
 
 class Dropna:
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 155dd6f8e13a0..133bca2ede107 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -6,6 +6,7 @@
     NaT,
     Series,
     date_range,
+    timedelta_range,
 )
 
 from .pandas_vb_common import tm
@@ -55,6 +56,55 @@ def time_nsmallest(self, keep):
         self.s.nsmallest(3, keep=keep)
 
 
+class Fillna:
+
+    params = (
+        [True, False],
+        [None, "pad", "bfill"],
+        [
+            "object",
+            "int32",
+            "int64",
+            "Int64",
+            "float32",
+            "float64",
+            "Float64",
+            "datetime64[ns]",
+            "datetime64[ns, tz]",
+            "timedelta64[ns]",
+        ],
+    )
+    param_names = ["inplace", "method", "dtype"]
+
+    def setup(self, inplace, method, dtype):
+        N = 10000
+        if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
+            data = {
+                "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
+                "datetime64[ns, tz]": date_range(
+                    "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
+                ),
+                "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
+            }
+            self.ser = Series(data[dtype])
+            self.value = (
+                dict(zip(self.ser.index, self.ser.values)) if not method else None
+            )
+            self.ser[::2] = None
+        else:
+            data = np.random.randn(N)
+            if dtype in ["int32", "int64", "Int64"]:
+                data = data.round()
+            self.ser = Series(data, dtype=dtype)
+            self.value = (
+                dict(zip(self.ser.index, self.ser.values)) if not method else None
+            )
+            self.ser[::2] = None
+
+    def time_series_fillna(self, inplace, method, dtype):
+        self.ser.fillna(value=self.value, inplace=inplace, method=method)
+
+
 class Dropna:
 
     params = ["int", "datetime"]
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 91a446dd99334..93bbb502363d6 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6347,11 +6347,23 @@ def fillna(
         else:
             if self.ndim == 1:
                 if isinstance(value, (dict, ABCSeries)):
-                    value = create_series_with_explicit_dtype(
+                    value_map = create_series_with_explicit_dtype(
                         value, dtype_if_empty=object
                     )
-                    value = value.reindex(self.index, copy=False)
-                    value = value._values
+                    # GH#40498 objects can have multiple types of missing values which
+                    # should not be modified unless specified. Add special casing to
+                    # minimize performance decrease on other data types where this is
+                    # not required.
+                    if is_object_dtype(self.dtype):
+                        value = self.copy()
+                        modification_index = value.index.intersection(value_map.index)
+                        if not modification_index.empty:
+                            value.loc[modification_index] = value_map[
+                                modification_index
+                            ]
+                    else:
+                        value = value_map.reindex(self.index)
+                        value = value._values
                 elif not is_list_like(value):
                     pass
                 else:
diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
index 8361ec6c6b5fa..de95aae31f852 100644
--- a/pandas/tests/series/methods/test_fillna.py
+++ b/pandas/tests/series/methods/test_fillna.py
@@ -623,6 +623,41 @@ def test_fillna_numeric_inplace(self):
         expected = x.fillna(value=0)
         tm.assert_series_equal(y, expected)
 
+    def test_fillna_object_null_value_replacement(
+        self, frame_or_series, unique_nulls_fixture, unique_nulls_fixture2
+    ):
+        # GH#40498
+        ser = Series([1, unique_nulls_fixture, "three"])
+        obj = frame_or_series(ser)
+
+        if unique_nulls_fixture2 is not None:
+            value = unique_nulls_fixture2
+        else:
+            pytest.skip(f"{unique_nulls_fixture2} cannot be passed to fillna.")
+
+        result = obj.fillna(value)
+
+        expected = Series([1, unique_nulls_fixture2, "three"])
+        expected = frame_or_series(expected)
+
+        tm.assert_equal(result, expected)
+
+    def test_fillna_object_other_missing_values_not_modified(
+        self, unique_nulls_fixture, unique_nulls_fixture2, frame_or_series
+    ):
+        # GH#40498
+        ser = Series([1, unique_nulls_fixture, unique_nulls_fixture2, "four"])
+        obj = frame_or_series(ser)
+
+        value = {2: 0} if frame_or_series is Series else {0: {2: 0}}
+
+        result = obj.fillna(value)
+
+        expected = Series([1, unique_nulls_fixture, 0, "four"])
+        expected = frame_or_series(expected)
+
+        tm.assert_equal(result, expected)
+
     # ---------------------------------------------------------------
     # CategoricalDtype
 