Skip to content

Fix 40420: Interpret NaN in clip() as no bound. #40927

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 23, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,7 @@ Other
- Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`)
- Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`)
- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`)
- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)

.. ---------------------------------------------------------------------------

Expand Down
45 changes: 41 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7352,6 +7352,14 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
threshold = self._constructor(threshold, index=self.index)
else:
threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]

# GH 40420
# In order to ignore NaN values in the threshold, set the entries of
# subset that correspond to those NaN values to True. This tells the
# final where() to keep the original value instead of clipping it.
if is_list_like(threshold) and threshold.isna().any(axis=None):
subset_kwargs = {"axis": axis} if threshold.ndim != subset.ndim else {}
subset = subset.where(threshold.notna(), True, **subset_kwargs)
return self.where(subset, threshold, axis=axis, inplace=inplace)

@overload
Expand Down Expand Up @@ -7483,10 +7491,12 @@ def clip(
----------
lower : float or array_like, default None
Minimum threshold value. All values below this
threshold will be set to it.
threshold will be set to it. A missing
threshold (e.g. `NA`) will not clip the value.
upper : float or array_like, default None
Maximum threshold value. All values above this
threshold will be set to it.
threshold will be set to it. A missing
threshold (e.g. `NA`) will not clip the value.
axis : int or str axis name, optional
Align object with lower and upper along the given axis.
inplace : bool, default False
Expand Down Expand Up @@ -7547,6 +7557,25 @@ def clip(
2 0 3
3 6 8
4 5 3

Clips using specific lower threshold per column element, with missing values:

>>> t = pd.Series([2, -4, np.NaN, 6, 3])
>>> t
0 2.0
1 -4.0
2 NaN
3 6.0
4 3.0
dtype: float64

>>> df.clip(t, axis=0)
col_0 col_1
0 9 2
1 -3 -4
2 0 6
3 6 8
4 5 3
"""
inplace = validate_bool_kwarg(inplace, "inplace")

Expand All @@ -7559,9 +7588,17 @@ def clip(
# so ignore
# GH 19992
# numpy doesn't drop a list-like bound containing NaN
if not is_list_like(lower) and np.any(isna(lower)):
isna_lower = isna(lower)
if not is_list_like(lower):
if np.any(isna_lower):
lower = None
elif np.all(isna_lower):
lower = None
if not is_list_like(upper) and np.any(isna(upper)):
isna_upper = isna(upper)
if not is_list_like(upper):
if np.any(isna_upper):
upper = None
elif np.all(isna_upper):
upper = None

# GH 2747 (arguments were reversed)
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,25 @@ def test_clip_with_na_args(self, float_frame):
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)

# GH#19992
# GH#19992 and adjusted in GH#40420
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})

result = df.clip(lower=[4, 5, np.nan], axis=0)
expected = DataFrame(
{"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]}
{"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)

result = df.clip(lower=[4, 5, np.nan], axis=1)
expected = DataFrame(
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]}
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
)
tm.assert_frame_equal(result, expected)

# GH#40420
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
df = DataFrame(data)
t = Series([2, -4, np.NaN, 6, 3])
result = df.clip(lower=t, axis=0)
expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
tm.assert_frame_equal(result, expected)
9 changes: 7 additions & 2 deletions pandas/tests/series/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,13 @@ def test_clip_with_na_args(self):
tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3]))

# GH#19992
tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan]))
tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1]))
tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3]))
tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1]))

# GH#40420
s = Series([1, 2, 3])
result = s.clip(0, [np.nan, np.nan, np.nan])
tm.assert_series_equal(s, result)

def test_clip_against_series(self):
# GH#6966
Expand Down