Skip to content

BUG: GroupBy.ffill()/bfill() do not return NaN values for NaN groups #36790

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Oct 10, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Bug fixes
- Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`)
- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`)
- Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`)
- Bug in :meth:`DataFrameGroupBy.ffill` where a ``NaN`` group would return forward-filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)

.. ---------------------------------------------------------------------------

Expand Down
16 changes: 15 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1847,16 +1847,30 @@ def _fill(self, direction, limit=None):
if limit is None:
limit = -1

return self._get_cythonized_result(
def _nan_group_gets_nan_values(values, *args):
if not self.dropna:
return values
in_nan_group = DataFrame(self.grouper.codes).eq(-1).any()
if in_nan_group.any():
filler = {np.datetime64: np.datetime64("NaT")}.get(
values.dtype.type, np.nan
)
values[in_nan_group] = filler
return values

res = self._get_cythonized_result(
"group_fillna_indexer",
numeric_only=False,
needs_mask=True,
cython_dtype=np.dtype(np.int64),
result_is_index=True,
direction=direction,
limit=limit,
post_processing=_nan_group_gets_nan_values,
)

return res

@Substitution(name="groupby")
def pad(self, limit=None):
"""
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,76 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("limit", [None, 1])
@pytest.mark.parametrize("method", ["ffill", "bfill", "pad", "backfill"])
@pytest.mark.parametrize("by", ["grp1", ["grp1"], ["grp1", "grp2"]])
@pytest.mark.parametrize("has_nan", [[], ["grp1"], ["grp1", "grp2"]])
def test_pad_handles_nan_groups(dropna, limit, method, by, has_nan):
    # GH 34725: with dropna=True, rows that belong to a NaN group must stay
    # NaN after a groupby ffill/bfill instead of being filled from neighbors.

    # Create two rows with many different dtypes. The first row will be in
    # the 'good' group, which never has a NaN in the grouping column(s). The
    # second row will be in the 'bad' group, which sometimes has a NaN in
    # the grouping column(s).
    rows = pd.DataFrame(
        {
            "int": pd.array([1, 2], dtype="Int64"),
            "float": [0.1, 0.2],
            "bool": pd.array([True, False], dtype="bool"),
            "date": [pd.Timestamp(2010, 1, 1), pd.Timestamp(2020, 2, 2)],
            "period": pd.array(
                [pd.Period("2010-01"), pd.Period("2020-2")], dtype="period[M]"
            ),
            "obj": ["hello", "world"],
            "cat": pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        }
    )

    # Put those rows into a 10-row dataframe at positions 2 and 7. This
    # allows us to ffill and bfill the rows and confirm that the method is
    # behaving as expected.
    ridx = pd.Series([None] * 10)
    ridx[2] = 0
    ridx[7] = 1
    df = rows.reindex(ridx).reset_index(drop=True)

    # Add the grouping column(s): rows 0-4 -> 'good', rows 5-9 -> 'bad'.
    grps = pd.Series(["good"] * 5 + ["bad"] * 5)
    if isinstance(by, list):
        grps = pd.concat([grps] * len(by), axis=1)
    df[by] = grps

    # Our 'has_nan' arg sometimes lists more columns than we are actually
    # grouping by (our 'by' arg), i.e. has_nan=['grp1', 'grp2'] when
    # by=['grp1']. We can just reduce 'has_nan' to its intersection with 'by'.
    by = by if isinstance(by, list) else [by]
    has_nan = list(set(has_nan).intersection(set(by)))

    # For the columns that are in 'has_nan', replace 'bad' with NaN.
    df[has_nan] = df[has_nan].replace("bad", np.nan)

    grouped = df.groupby(by=by, dropna=dropna)
    result = getattr(grouped, method)(limit=limit)

    # If dropna=True and 'bad' has been replaced by NaN, then the second
    # 5 rows will all be NaN, which is what we want: the NaN group contains
    # only NaN values.
    if dropna and (len(has_nan) > 0):
        ridx[7] = None

    # To get our expected/benchmark output, we ffill/bfill the rows directly
    # (not via a groupby), so we don't want limit=None for this part. With 5
    # rows per group and the value rows in positions 2 & 7, we ffill/bfill
    # with limit=2. If we used limit=None, rows 2 & 7 would ffill/bfill into
    # the other group.
    lim = 2 if limit is None else limit
    ridx = getattr(ridx, method)(limit=lim)
    expected = rows.reindex(ridx).reset_index(drop=True)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", [np.any, np.all])
def test_any_all_np_func(func):
# GH 20653
Expand Down