Skip to content

BUG GH#40498 Fillna other missing values not modified #42981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,13 +374,13 @@ class Fillna:

params = (
[True, False],
["pad", "bfill"],
[None, "pad", "bfill"],
[
"float64",
"float32",
"object",
"Int64",
"Float64",
"int32",
"int64",
"float32",
"float64",
"datetime64[ns]",
"datetime64[ns, tz]",
"timedelta64[ns]",
Expand All @@ -400,15 +400,18 @@ def setup(self, inplace, method, dtype):
}
self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)})
self.df[::2] = None
self.value = (
dict(zip(self.df.columns, data[dtype][:M])) if not method else None
)
else:
values = np.random.randn(N, M)
values[::2] = np.nan
if dtype == "Int64":
values = values.round()
self.df = DataFrame(values, dtype=dtype)
data = np.random.randn(N, M)
data = data.astype(dtype)
self.df = DataFrame({f"col_{i}": data[i] for i in range(M)})
self.df[::2] = None
self.value = dict(zip(self.df.columns, data[M])) if not method else None

def time_frame_fillna(self, inplace, method, dtype):
self.df.fillna(inplace=inplace, method=method)
self.df.fillna(value=self.value, inplace=inplace, method=method)


class Dropna:
Expand Down
43 changes: 43 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
NaT,
Series,
date_range,
timedelta_range,
)

from .pandas_vb_common import tm
Expand Down Expand Up @@ -42,6 +43,48 @@ def time_nsmallest(self, keep):
self.s.nsmallest(3, keep=keep)


class Fillna:
    """Benchmark ``Series.fillna`` over inplace / method / dtype combinations."""

    params = (
        [True, False],
        [None, "pad", "bfill"],
        [
            "object",
            "int32",
            "int64",
            "float32",
            "float64",
            "datetime64[ns]",
            "datetime64[ns, tz]",
            "timedelta64[ns]",
        ],
    )
    param_names = ["inplace", "method", "dtype"]

    def setup(self, inplace, method, dtype):
        """
        Build ``self.ser`` with every other element set to None, and
        ``self.value``, the fill mapping handed to ``fillna``.

        ``self.value`` maps each index label to the corresponding original
        (pre-blanking) element when no ``method`` is given; otherwise it is
        None so that only ``method`` drives the fill.
        """
        size = 10000
        if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
            datetimelike = {
                "datetime64[ns]": date_range("2011-01-01", freq="H", periods=size),
                "datetime64[ns, tz]": date_range(
                    "2011-01-01", freq="H", periods=size, tz="Asia/Tokyo"
                ),
                "timedelta64[ns]": timedelta_range(
                    start="1 day", periods=size, freq="1D"
                ),
            }
            source = datetimelike[dtype]
            ser = Series(source)
        else:
            source = np.random.randn(size).astype(dtype)
            ser = Series(source, dtype=dtype)

        # Blank out every other element so there is something to fill.
        ser[::2] = None
        self.ser = ser

        if method:
            self.value = None
        else:
            self.value = dict(zip(ser.index, source))

    def time_series_fillna(self, inplace, method, dtype):
        """Time a single ``fillna`` call with the prepared value/method."""
        self.ser.fillna(value=self.value, inplace=inplace, method=method)


class Dropna:

params = ["int", "datetime"]
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6308,11 +6308,23 @@ def fillna(
else:
if self.ndim == 1:
if isinstance(value, (dict, ABCSeries)):
value = create_series_with_explicit_dtype(
value_map = create_series_with_explicit_dtype(
value, dtype_if_empty=object
)
value = value.reindex(self.index, copy=False)
value = value._values
# GH#40498 objects can have multiple types of missing values which
# should not be modified unless specified. Add special casing to
# minimize performance decrease on other data types where this is
# not required.
if is_object_dtype(self.dtype):
value = self.copy()
modification_index = value.index.intersection(value_map.index)
if not modification_index.empty:
value.loc[modification_index] = value_map[
modification_index
]
else:
value = value_map.reindex(self.index, copy=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't you just set copy=True for the object dtype?

Copy link
Author

@ghost ghost Sep 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the original logic that was present for all value dictionaries before I added the separate object type conditional.

I'm not sure why they chose to set copy=False explicitly, since in both the old and new code we replace value with the reindexed version and the original value is not used anywhere else. In addition, unless every index label is a key in the value dictionary, I believe it has no effect, because "A new object is produced unless the new index is equivalent to the current one and copy=False".

I will remove the unnecessary argument and return to the default, since it adds no clear value.

value = value._values
elif not is_list_like(value):
pass
else:
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,35 @@ def test_fillna_numeric_inplace(self):
expected = x.fillna(value=0)
tm.assert_series_equal(y, expected)

def test_fillna_null_value_replacement(self, frame_or_series):
    """GH#40498: filling with NaN replaces None entries with NaN."""
    obj = frame_or_series(Series([None, None]))
    expected = frame_or_series(Series([np.nan, np.nan]))

    fill_value = np.nan
    result = obj.fillna(fill_value)

    tm.assert_equal(result, expected)

def test_fillna_other_missing_values_not_modified(
    self, unique_nulls_fixture, unique_nulls_fixture2, frame_or_series
):
    """GH#40498: only the position named in the fill mapping is replaced."""
    obj = frame_or_series(
        Series([1, unique_nulls_fixture, unique_nulls_fixture2, "four"])
    )

    # A Series takes {label: fill}; a DataFrame takes {column: {label: fill}}.
    if frame_or_series is Series:
        fill_map = {2: 0}
    else:
        fill_map = {0: {2: 0}}

    result = obj.fillna(fill_map)

    # The null at position 1 is not in the mapping and must stay as it was.
    expected = frame_or_series(Series([1, unique_nulls_fixture, 0, "four"]))
    tm.assert_equal(result, expected)

# ---------------------------------------------------------------
# CategoricalDtype

Expand Down