Skip to content

Fix 40420: Interpret NaN in clip() as no bound. #40927

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,7 @@ Other
- Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
- Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`)
- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`)
- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)

.. ---------------------------------------------------------------------------

Expand Down
51 changes: 45 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7341,8 +7341,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
return self._clip_with_scalar(None, threshold, inplace=inplace)
return self._clip_with_scalar(threshold, None, inplace=inplace)

subset = method(threshold, axis=axis) | isna(self)

# GH #15390
# In order for where method to work, the threshold must
# be transformed to NDFrame from other array like structure.
Expand All @@ -7351,6 +7349,18 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
threshold = self._constructor(threshold, index=self.index)
else:
threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]

# GH 40420
# Treat missing thresholds as no bounds, not clipping the values
if is_list_like(threshold):
fill_value = np.inf if method.__name__ == "le" else -np.inf
threshold_inf = threshold.fillna(fill_value)
Copy link
Contributor

@ms7463 ms7463 Dec 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This causes an issue when working with datetime series (see here: #44785). Since downstream it will try to compare a float to a Timestamp object. There's probably a more robust solution, but a potential quick/easy fix would be to check the dtype here and use an appropriate alternative to np.inf (e.g. pd.Timestamp.max/min).

else:
threshold_inf = threshold

subset = method(threshold_inf, axis=axis) | isna(self)

# GH 40420
return self.where(subset, threshold, axis=axis, inplace=inplace)

@overload
Expand Down Expand Up @@ -7482,10 +7492,12 @@ def clip(
----------
lower : float or array_like, default None
Minimum threshold value. All values below this
threshold will be set to it.
threshold will be set to it. A missing
threshold (e.g `NA`) will not clip the value.
upper : float or array_like, default None
Maximum threshold value. All values above this
threshold will be set to it.
threshold will be set to it. A missing
threshold (e.g `NA`) will not clip the value.
axis : int or str axis name, optional
Align object with lower and upper along the given axis.
inplace : bool, default False
Expand Down Expand Up @@ -7546,6 +7558,25 @@ def clip(
2 0 3
3 6 8
4 5 3

Clips using specific lower threshold per column element, with missing values:

>>> t = pd.Series([2, -4, np.NaN, 6, 3])
>>> t
0 2.0
1 -4.0
2 NaN
3 6.0
4 3.0
dtype: float64

>>> df.clip(t, axis=0)
col_0 col_1
0 9 2
1 -3 -4
2 0 6
3 6 8
4 5 3
"""
inplace = validate_bool_kwarg(inplace, "inplace")

Expand All @@ -7558,9 +7589,17 @@ def clip(
# so ignore
# GH 19992
# numpy doesn't drop a list-like bound containing NaN
if not is_list_like(lower) and np.any(isna(lower)):
isna_lower = isna(lower)
if not is_list_like(lower):
if np.any(isna_lower):
lower = None
elif np.all(isna_lower):
lower = None
if not is_list_like(upper) and np.any(isna(upper)):
isna_upper = isna(upper)
if not is_list_like(upper):
if np.any(isna_upper):
upper = None
elif np.all(isna_upper):
upper = None

# GH 2747 (arguments were reversed)
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,25 @@ def test_clip_with_na_args(self, float_frame):
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)

# GH#19992
# GH#19992 and adjusted in GH#40420
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})

result = df.clip(lower=[4, 5, np.nan], axis=0)
expected = DataFrame(
{"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]}
{"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)

result = df.clip(lower=[4, 5, np.nan], axis=1)
expected = DataFrame(
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]}
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)

# GH#40420
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
df = DataFrame(data)
t = Series([2, -4, np.NaN, 6, 3])
result = df.clip(lower=t, axis=0)
expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
tm.assert_frame_equal(result, expected)
9 changes: 7 additions & 2 deletions pandas/tests/series/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,13 @@ def test_clip_with_na_args(self):
tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3]))

# GH#19992
tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan]))
tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1]))
tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3]))
tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1]))

# GH#40420
s = Series([1, 2, 3])
result = s.clip(0, [np.nan, np.nan, np.nan])
tm.assert_series_equal(s, result)

def test_clip_against_series(self):
# GH#6966
Expand Down