Skip to content

ENH: column-wise DataFrame.fillna with Series/Dict value #38352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ Other enhancements
- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`)
- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
- :meth:`DataFrame.fillna` now accepts a dictionary or :class:`Series` as ``value`` with ``axis=1``, filling NA values by matching the keys against the row labels (:issue:`4514`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this entry to the 1.3 whatsnew.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

- :meth:`io.sql.get_schema` now supports a ``schema`` keyword argument that will add a schema into the create table statement (:issue:`28486`)
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
- :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`)
Expand Down
24 changes: 12 additions & 12 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6398,20 +6398,20 @@ def fillna(
)

elif isinstance(value, (dict, ABCSeries)):
if axis == 1:
raise NotImplementedError(
"Currently only can fill "
"with dict/Series column "
"by column"
)
tmp = self if inplace else self.copy()

result = self if inplace else self.copy()
for k, v in value.items():
if k not in result:
for i, (label, content) in enumerate(tmp.items()):
if axis == 0 and label not in value:
continue
obj = result[k]
obj.fillna(v, limit=limit, inplace=True, downcast=downcast)
return result if not inplace else None
tmp.iloc[:, i] = content.fillna(
value[label] if axis == 0 else value,
limit=limit,
inplace=False,
downcast=downcast,
)

tmp = tmp.infer_objects()
new_data = tmp._mgr

elif not is_list_like(value):
new_data = self._mgr.fillna(
Expand Down
95 changes: 91 additions & 4 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,10 +432,6 @@ def test_fillna_dict_series(self):
expected = df.fillna(df.max().to_dict())
tm.assert_frame_equal(result, expected)

# disable this for now
with pytest.raises(NotImplementedError, match="column by column"):
df.fillna(df.max(1), axis=1)

def test_fillna_dataframe(self):
# GH#8377
df = DataFrame(
Expand Down Expand Up @@ -525,6 +521,97 @@ def test_fill_corner(self, float_frame, float_string_frame):
# TODO(wesm): unused?
result = empty_float.fillna(value=0) # noqa

# Two (expected, fill_value) cases for filling with axis=1:
#   1. Series with a value for every row label (0, 1, 2): each NaN is replaced
#      by the value keyed by its row label.
#   2. dict without key 1: the NaN in row 1 is expected to remain NaN; key 2
#      has nothing to fill (row 2 holds no NaN) and key 3 matches no row label.
@pytest.mark.parametrize(
"expected,fill_value",
[
(
DataFrame(
[[100, 100], [200, 4], [5, 6]], columns=list("AB"), dtype="float64"
),
Series([100, 200, 300]),
),
(
DataFrame(
[[100, 100], [np.nan, 4], [5, 6]],
columns=list("AB"),
dtype="float64",
),
{0: 100, 2: 300, 3: 400},
),
],
)
def test_fillna_column_wise(self, expected, fill_value):
# GH 4514
# Frame uses the default row labels 0..2; NaNs sit in row 0 (both columns)
# and in row 1 (column "A" only).
df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB"))
# axis=1 is the call form under test: fill_value keys refer to row labels.
result = df.fillna(fill_value, axis=1)
tm.assert_frame_equal(expected, result)

def test_fillna_column_wise_downcast(self):
# GH 4514
df = DataFrame([[np.nan, 2], [3, np.nan], [np.nan, np.nan]], columns=list("AB"))
s = Series([100, 200, 300])

expected = DataFrame(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if we have a datetime column mixed in here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We end up with object dtype, both when the fill values are datetimes and when the frame's columns are datetimes:

In [5]: df = pd.DataFrame([[np.nan, 2], [3, np.nan], [np.nan, np.nan]], columns=list("AB"))
   ...: s = pd.Series(pd.to_datetime([100, 200, 300], unit="ns"))
   ...: 
   ...: result = df.fillna(s, axis=1, downcast="infer")
   ...: result
Out[5]: 
                               A                              B
0  1970-01-01 00:00:00.000000100                            2.0
1                            3.0  1970-01-01 00:00:00.000000200
2  1970-01-01 00:00:00.000000300  1970-01-01 00:00:00.000000300

In [6]: result.dtypes
Out[6]: 
A    object
B    object
dtype: object
In [8]: df = pd.DataFrame({"A": pd.to_datetime([np.nan, 2, np.nan]), "B": pd.to_datetime([3, np.nan, np.nan])})
   ...: s = pd.Series([100, 200, 300])
   ...: 
   ...: result = df.fillna(s, axis=1, downcast="infer")
   ...: result
Out[8]: 
                               A                              B
0                            100  1970-01-01 00:00:00.000000003
1  1970-01-01 00:00:00.000000002                            200
2                            300                            300

In [9]: result.dtypes
Out[9]: 
A    object
B    object

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this what we want?

[[100, 2], [3, 200], [300, 300]], columns=list("AB"), dtype="int64"
)
result = df.fillna(s, axis=1, downcast="infer")
tm.assert_frame_equal(expected, result)

# Runs once with a Series and once with a dict as the fill value.
@pytest.mark.parametrize(
"fill_value", [Series([100, 200, 300]), {0: 100, 2: 300, 3: 400}]
)
def test_fillna_column_wise_inplace(self, fill_value):
# GH 4514
# Checks that axis=1 filling with inplace=True mutates df into the same
# frame that the inplace=False call returns.
df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB"))
# Compute the reference result first, while df is still unmodified.
expected = df.fillna(fill_value, axis=1, inplace=False)
df.fillna(fill_value, axis=1, inplace=True)
tm.assert_frame_equal(expected, df)

# The Series and the dict carry the same label -> value mapping (0, 1, 2).
@pytest.mark.parametrize(
"fill_value",
[Series([100, 200, 300], index=[0, 1, 2]), {0: 100, 1: 200, 2: 300}],
)
def test_fillna_column_wise_duplicated_with_series_dict(self, fill_value):
# GH 4514
# axis=1 filling on a frame with duplicated column labels ("B" twice) and
# duplicated row labels (0 twice).
df = DataFrame(
[[np.nan, np.nan, 3], [np.nan, 5, np.nan], [7, np.nan, np.nan]],
columns=list("ABB"),
index=[0, 0, 1],
)
# Both rows labelled 0 are expected to be filled with 100 and the row
# labelled 1 with 200; key 2 matches no row label in this frame.
expected = DataFrame(
[[100, 100, 3], [100, 5, 100], [7, 200, 200]],
columns=list("ABB"),
index=[0, 0, 1],
dtype="float64",
)

result = df.fillna(fill_value, axis=1)
tm.assert_frame_equal(result, expected)

# The Series and the dict carry the same column-label -> value mapping.
@pytest.mark.parametrize(
"fill_value",
[
Series([100, 200, 300], index=["A", "B", "C"]),
{"A": 100, "B": 200, "C": 300},
],
)
def test_fillna_duplicated_with_series_dict(self, fill_value):
# GH 4514
# fillna called without an axis argument on a frame with duplicated column
# labels ("B" twice) and duplicated row labels (0 twice).
df = DataFrame(
[[np.nan, np.nan, 3], [np.nan, 5, np.nan], [7, np.nan, np.nan]],
columns=list("ABB"),
index=[0, 0, 1],
)
# Column "A" is expected to be filled with 100 and both "B" columns with
# 200; key "C" matches no column in this frame.
expected = DataFrame(
[[100, 200, 3], [100, 5, 200], [7, 200, 200]],
columns=list("ABB"),
index=[0, 0, 1],
dtype="float64",
)

result = df.fillna(fill_value)
tm.assert_frame_equal(result, expected)


def test_fillna_nonconsolidated_frame():
# https://github.com/pandas-dev/pandas/issues/36495
Expand Down