Skip to content

BUG GH#40498 Fillna other missing values not modified #42981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,12 +374,14 @@ class Fillna:

params = (
[True, False],
["pad", "bfill"],
[None, "pad", "bfill"],
[
"float64",
"float32",
"object",
"int32",
"int64",
"Int64",
"float32",
"float64",
"Float64",
"datetime64[ns]",
"datetime64[ns, tz]",
Expand All @@ -399,16 +401,26 @@ def setup(self, inplace, method, dtype):
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
}
self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)})
self.value = (
dict(zip(self.df.columns, self.df.sample().iloc[0]))
if not method
else None
)
self.df[::2] = None
else:
values = np.random.randn(N, M)
values[::2] = np.nan
if dtype == "Int64":
values = values.round()
self.df = DataFrame(values, dtype=dtype)
data = np.random.randn(N, M)
if dtype in ["int32", "int64", "Int64"]:
data = data.round()
self.df = DataFrame({f"col_{i}": data[i] for i in range(M)}, dtype=dtype)
self.value = (
dict(zip(self.df.columns, self.df.sample().iloc[0]))
if not method
else None
)
self.df[::2] = None

def time_frame_fillna(self, inplace, method, dtype):
self.df.fillna(inplace=inplace, method=method)
self.df.fillna(value=self.value, inplace=inplace, method=method)


class Dropna:
Expand Down
50 changes: 50 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
NaT,
Series,
date_range,
timedelta_range,
)

from .pandas_vb_common import tm
Expand Down Expand Up @@ -55,6 +56,55 @@ def time_nsmallest(self, keep):
self.s.nsmallest(3, keep=keep)


class Fillna:

params = (
[True, False],
[None, "pad", "bfill"],
[
"object",
"int32",
"int64",
"Int64",
"float32",
"float64",
"Float64",
"datetime64[ns]",
"datetime64[ns, tz]",
"timedelta64[ns]",
],
)
param_names = ["inplace", "method", "dtype"]

def setup(self, inplace, method, dtype):
N = 10000
if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
data = {
"datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
}
self.ser = Series(data[dtype])
self.value = (
dict(zip(self.ser.index, self.ser.values)) if not method else None
)
self.ser[::2] = None
else:
data = np.random.randn(N)
if dtype in ["int32", "int64", "Int64"]:
data = data.round()
self.ser = Series(data, dtype=dtype)
self.value = (
dict(zip(self.ser.index, self.ser.values)) if not method else None
)
self.ser[::2] = None

def time_series_fillna(self, inplace, method, dtype):
self.ser.fillna(value=self.value, inplace=inplace, method=method)


class Dropna:

params = ["int", "datetime"]
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6347,11 +6347,23 @@ def fillna(
else:
if self.ndim == 1:
if isinstance(value, (dict, ABCSeries)):
value = create_series_with_explicit_dtype(
value_map = create_series_with_explicit_dtype(
value, dtype_if_empty=object
)
value = value.reindex(self.index, copy=False)
value = value._values
# GH#40498 objects can have multiple types of missing values which
# should not be modified unless specified. Add special casing to
# minimize performance decrease on other data types where this is
# not required.
if is_object_dtype(self.dtype):
value = self.copy()
modification_index = value.index.intersection(value_map.index)
if not modification_index.empty:
value.loc[modification_index] = value_map[
modification_index
]
else:
value = value_map.reindex(self.index)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you don't add the copy does this just work e.g. do you still need L6314-6325

Copy link
Author

@ghost ghost Sep 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the testing method we're using cannot differentiate the following:

result = pd.Series([1, None, 3, "four"])
expected = pd.Series([1, np.nan, 3, "four"])
tm.assert_equal(result, expected)

If you remove L6314-6325 the unit tests will pass but the original problem will still exist. I mentioned this on my initial Pull Request but these tests will not fail because of an open issue - #18463. I probably should've clarified if it's worth writing tests with other bugs in mind, or to write them as if the bug will be fixed eventually.

What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.reindex should just work here. I am really not sure what you are trying to do.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you update here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Sorry I might not be following, what did you want me to update here?

value = value._values
elif not is_list_like(value):
pass
else:
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,41 @@ def test_fillna_numeric_inplace(self):
expected = x.fillna(value=0)
tm.assert_series_equal(y, expected)

def test_fillna_object_null_value_replacement(
self, frame_or_series, unique_nulls_fixture, unique_nulls_fixture2
):
# GH#40498
ser = Series([1, unique_nulls_fixture, "three"])
obj = frame_or_series(ser)

if unique_nulls_fixture2 is not None:
value = unique_nulls_fixture2
else:
pytest.skip(f"{unique_nulls_fixture2} cannot be passed to fillna.")

result = obj.fillna(value)

expected = Series([1, unique_nulls_fixture2, "three"])
expected = frame_or_series(expected)

tm.assert_equal(result, expected)

def test_fillna_object_other_missing_values_not_modified(
self, unique_nulls_fixture, unique_nulls_fixture2, frame_or_series
):
# GH#40498
ser = Series([1, unique_nulls_fixture, unique_nulls_fixture2, "four"])
obj = frame_or_series(ser)

value = {2: 0} if frame_or_series is Series else {0: {2: 0}}

result = obj.fillna(value)

expected = Series([1, unique_nulls_fixture, 0, "four"])
expected = frame_or_series(expected)

tm.assert_equal(result, expected)

# ---------------------------------------------------------------
# CategoricalDtype

Expand Down