From 66f833102b20eeca8af65a87d182f3c8230713d3 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Mon, 12 Apr 2021 11:11:39 +0200 Subject: [PATCH 01/13] Code fix and test --- pandas/core/generic.py | 47 ++++++++++++++++++++---- pandas/tests/series/methods/test_clip.py | 9 ++++- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 65adc258a9b69..6ff80454017a0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -109,7 +109,10 @@ ) import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray -from pandas.core.base import PandasObject +from pandas.core.base import ( + PandasObject, + SelectionMixin, +) import pandas.core.common as com from pandas.core.construction import ( create_series_with_explicit_dtype, @@ -184,7 +187,7 @@ bool_t = bool # Need alias because NDFrame has def bool: -class NDFrame(PandasObject, indexing.IndexingMixin): +class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a size-mutable, labeled data structure @@ -681,6 +684,18 @@ def size(self) -> int: # error: Incompatible return value type (got "number", expected "int") return np.prod(self.shape) # type: ignore[return-value] + @final + @property + def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: + """ internal compat with SelectionMixin """ + return self + + @final + @property + def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: + """ internal compat with SelectionMixin """ + return self + @overload def set_axis( self: FrameOrSeries, labels, axis: Axis = ..., inplace: Literal[False] = ... @@ -7350,6 +7365,12 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): threshold = self._constructor(threshold, index=self.index) else: threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] + + # GH 40420 + # In order to ignore nan values in the threshold, replace these values with the + # values from the original frame/series. + if is_list_like(threshold) and threshold.isna().any(axis=None): + threshold.where(threshold.notna(), self, inplace=True) return self.where(subset, threshold, axis=axis, inplace=inplace) @final @@ -7449,10 +7470,20 @@ def clip( # so ignore # GH 19992 # numpy doesn't drop a list-like bound containing NaN - if not is_list_like(lower) and np.any(isna(lower)): - lower = None - if not is_list_like(upper) and np.any(isna(upper)): - upper = None + isna_lower = isna(lower) + if not is_list_like(lower): + if np.any(isna_lower): + lower = None + else: + if np.all(isna_lower): + lower = None + isna_upper = isna(upper) + if not is_list_like(upper): + if np.any(isna_upper): + upper = None + else: + if np.all(isna_upper): + upper = None # GH 2747 (arguments were reversed) if ( @@ -10843,7 +10874,7 @@ def median( @doc( _num_doc, desc="Return the maximum of the values over the requested axis.\n\n" - "If you want the *index* of the maximum, use ``idxmax``. This is " + "If you want the *index* of the maximum, use ``idxmax``. This is" "the equivalent of the ``numpy.ndarray`` method ``argmax``.", name1=name1, name2=name2, @@ -10860,7 +10891,7 @@ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): @doc( _num_doc, desc="Return the minimum of the values over the requested axis.\n\n" - "If you want the *index* of the minimum, use ``idxmin``. This is " + "If you want the *index* of the minimum, use ``idxmin``. This is" "the equivalent of the ``numpy.ndarray`` method ``argmin``.", name1=name1, name2=name2, diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 528e95f65c8f4..442718d677101 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -49,8 +49,13 @@ def test_clip_with_na_args(self): tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH#19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) + tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3])) + tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1])) + + # GH#40420 + s = Series([1, 2, 3]) + result = s.clip(0, [np.nan, np.nan, np.nan]) + tm.assert_series_equal(s, result) def test_clip_against_series(self): # GH#6966 From 7b0b69cd03303a577e62b77fdfac480860ddc16c Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Tue, 13 Apr 2021 17:25:33 +0200 Subject: [PATCH 02/13] Update fix and tests. --- pandas/core/generic.py | 2 +- pandas/tests/frame/methods/test_clip.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6ff80454017a0..57b65d85d6de2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7370,7 +7370,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # In order to ignore nan values in the threshold, replace these values with the # values from the original frame/series. if is_list_like(threshold) and threshold.isna().any(axis=None): - threshold.where(threshold.notna(), self, inplace=True) + threshold = threshold.where(threshold.notna(), self, inplace=False) return self.where(subset, threshold, axis=axis, inplace=inplace) @final diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 8a2374a414482..c22fddfdf7434 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -149,12 +149,12 @@ def test_clip_with_na_args(self, float_frame): result = df.clip(lower=[4, 5, np.nan], axis=0) expected = DataFrame( - {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} ) tm.assert_frame_equal(result, expected) result = df.clip(lower=[4, 5, np.nan], axis=1) expected = DataFrame( - {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]} ) tm.assert_frame_equal(result, expected) From 768a5eb8ed5c46877ac005803a60bcdffe3783ae Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Tue, 13 Apr 2021 17:33:01 +0200 Subject: [PATCH 03/13] Fix bad merge --- pandas/core/generic.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ef2f6b5405c3..8c215feba23dc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -109,10 +109,7 @@ ) import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray -from pandas.core.base import ( - PandasObject, - SelectionMixin, -) +from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import ( create_series_with_explicit_dtype, @@ -187,7 +184,7 @@ bool_t = bool # Need alias because NDFrame has def bool: -class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): +class NDFrame(PandasObject, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a size-mutable, labeled data structure @@ -684,18 +681,6 @@ def size(self) -> int: # error: Incompatible return value type (got "number", expected "int") return np.prod(self.shape) # type: ignore[return-value] - @final - @property - def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: - """ internal compat with SelectionMixin """ - return self - - @final - @property - def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: - """ internal compat with SelectionMixin """ - return self - @overload def set_axis( self: FrameOrSeries, labels, axis: Axis = ..., inplace: Literal[False] = ... @@ -11017,7 +11002,7 @@ def median( @doc( _num_doc, desc="Return the maximum of the values over the requested axis.\n\n" - "If you want the *index* of the maximum, use ``idxmax``. This is" + "If you want the *index* of the maximum, use ``idxmax``. This is " "the equivalent of the ``numpy.ndarray`` method ``argmax``.", name1=name1, name2=name2, @@ -11034,7 +11019,7 @@ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): @doc( _num_doc, desc="Return the minimum of the values over the requested axis.\n\n" - "If you want the *index* of the minimum, use ``idxmin``. This is" + "If you want the *index* of the minimum, use ``idxmin``. This is " "the equivalent of the ``numpy.ndarray`` method ``argmin``.", name1=name1, name2=name2, From 66708d2f8905ed73b6ca0c277be927ae46bac44d Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Thu, 15 Apr 2021 17:03:44 +0200 Subject: [PATCH 04/13] Improve fix and add tests. --- pandas/core/generic.py | 45 +++++++++++++++++++------ pandas/tests/frame/methods/test_clip.py | 12 +++++-- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8c215feba23dc..837d11f2a7c78 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7352,10 +7352,12 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] # GH 40420 - # In order to ignore nan values in the threshold, replace these values with the - # values from the original frame/series. + # In order to ignore nan values in the threshold, set the values in + # subset that correspond to these na values to True. This indicates to the + # final where() to not clip. if is_list_like(threshold) and threshold.isna().any(axis=None): - threshold = threshold.where(threshold.notna(), self, inplace=False) + subset_kwargs = {"axis": axis} if threshold.ndim != subset.ndim else {} + subset = subset.where(threshold.notna(), True, **subset_kwargs) return self.where(subset, threshold, axis=axis, inplace=inplace) @overload @@ -7487,10 +7489,12 @@ def clip( ---------- lower : float or array_like, default None Minimum threshold value. All values below this - threshold will be set to it. + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. upper : float or array_like, default None Maximum threshold value. All values above this - threshold will be set to it. + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. axis : int or str axis name, optional Align object with lower and upper along the given axis. inplace : bool, default False @@ -7551,6 +7555,27 @@ def clip( 2 0 3 3 6 8 4 5 3 + + Clips using specific lower threshold per column element, with missing values: + + >>> t = pd.Series([2, -4, np.NaN, 6, 3]) + >>> t + 0 2 + 1 -4 + 2 + 3 6 + 4 3 + dtype: object + + >>> df.clip(t, axis=0) + col_0 col_1 + 0 9 2 + 1 -3 -4 + 2 0 6 + 3 6 8 + 4 5 3 + + """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -7567,16 +7592,14 @@ def clip( if not is_list_like(lower): if np.any(isna_lower): lower = None - else: - if np.all(isna_lower): - lower = None + elif np.all(isna_lower): + lower = None isna_upper = isna(upper) if not is_list_like(upper): if np.any(isna_upper): upper = None - else: - if np.all(isna_upper): - upper = None + elif np.all(isna_upper): + upper = None # GH 2747 (arguments were reversed) if ( diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index c22fddfdf7434..cee1c7d6c22c3 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -140,11 +140,11 @@ def test_clip_against_unordered_columns(self): def test_clip_with_na_args(self, float_frame): """Should process np.nan argument as None """ - # GH#17276 + # GH#17276 an adjusted in GH#40420 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) - # GH#19992 + # GH#19992 and adjusted in GH#40420 df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) result = df.clip(lower=[4, 5, np.nan], axis=0) @@ -158,3 +158,11 @@ def test_clip_with_na_args(self, float_frame): {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]} ) tm.assert_frame_equal(result, expected) + + # GH#40420 + data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} + df = DataFrame(data) + t = Series([2, -4, np.NaN, 6, 3]) + result = df.clip(lower=t, axis=0) + expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) + tm.assert_frame_equal(result, expected) From e7acdeb3daf0500c77c2f1bb3c2bb0e53c1709b3 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Thu, 15 Apr 2021 17:06:25 +0200 Subject: [PATCH 05/13] Small adjustments --- pandas/core/generic.py | 1 - pandas/tests/frame/methods/test_clip.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d21de3370bb16..dc9a71279ca38 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7577,7 +7577,6 @@ def clip( 3 6 8 4 5 3 - """ inplace = validate_bool_kwarg(inplace, "inplace") diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index cee1c7d6c22c3..6525109da4394 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -140,7 +140,7 @@ def test_clip_against_unordered_columns(self): def test_clip_with_na_args(self, float_frame): """Should process np.nan argument as None """ - # GH#17276 an adjusted in GH#40420 + # GH#17276 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) From 0bcb70d1ff781751460b7ca29228f5b31cb3becf Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Thu, 15 Apr 2021 17:10:46 +0200 Subject: [PATCH 06/13] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1c7942dfedafa..5fcacc9b049fc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -855,6 +855,7 @@ Other - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`) - Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`) +- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) .. --------------------------------------------------------------------------- From 4c2b3c597224e7b98d66989bb7fd3d84ec59ea7b Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Thu, 15 Apr 2021 18:17:30 +0200 Subject: [PATCH 07/13] Remove empty line at end of docstring. --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dc9a71279ca38..43023a7dc8d24 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7576,7 +7576,6 @@ def clip( 2 0 6 3 6 8 4 5 3 - """ inplace = validate_bool_kwarg(inplace, "inplace") From 751c8a9dcabe75603e142913d377aff17a545f87 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Thu, 15 Apr 2021 20:58:50 +0200 Subject: [PATCH 08/13] Fix doctest dtype --- pandas/core/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 43023a7dc8d24..be74807d0ea55 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7562,12 +7562,12 @@ def clip( >>> t = pd.Series([2, -4, np.NaN, 6, 3]) >>> t - 0 2 - 1 -4 - 2 - 3 6 - 4 3 - dtype: object + 0 2.0 + 1 -4.0 + 2 NaN + 3 6.0 + 4 3.0 + dtype: float64 >>> df.clip(t, axis=0) col_0 col_1 From c84df442f4dd57d8069a0b7d59a787e0a1e3ddff Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Fri, 16 Apr 2021 04:11:59 +0200 Subject: [PATCH 09/13] Use inf to replace NA --- pandas/core/generic.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be74807d0ea55..8a6db8f63e7f0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7342,8 +7342,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): return self._clip_with_scalar(None, threshold, inplace=inplace) return self._clip_with_scalar(threshold, None, inplace=inplace) - subset = method(threshold, axis=axis) | isna(self) - # GH #15390 # In order for where method to work, the threshold must # be transformed to NDFrame from other array like structure. @@ -7353,13 +7351,22 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): else: threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] + # GH 40420 + if is_list_like(threshold): + fill_value = np.inf if method.__name__ == "le" else -np.inf + threshold_inf = threshold.fillna(fill_value) + else: + threshold_inf = threshold + + subset = method(threshold_inf, axis=axis) | isna(self) + # GH 40420 # In order to ignore nan values in the threshold, set the values in # subset that correspond to these na values to True. This indicates to the # final where() to not clip. - if is_list_like(threshold) and threshold.isna().any(axis=None): - subset_kwargs = {"axis": axis} if threshold.ndim != subset.ndim else {} - subset = subset.where(threshold.notna(), True, **subset_kwargs) + # if is_list_like(threshold) and threshold.isna().any(axis=None): + # subset_kwargs = {"axis": axis} if threshold.ndim != subset.ndim else {} + # subset = subset.where(threshold.notna(), True, **subset_kwargs) return self.where(subset, threshold, axis=axis, inplace=inplace) @overload From 19005422dc3410812bb1f8cf9c95c3599c897b33 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Fri, 16 Apr 2021 04:14:41 +0200 Subject: [PATCH 10/13] Remove leftover code. --- pandas/core/generic.py | 68 +++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1add28bb53ce6..342bc048dbd55 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7352,6 +7352,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] # GH 40420 + # Treat missing thresholds as no bounds, not clipping the values if is_list_like(threshold): fill_value = np.inf if method.__name__ == "le" else -np.inf threshold_inf = threshold.fillna(fill_value) @@ -7361,12 +7362,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): subset = method(threshold_inf, axis=axis) | isna(self) # GH 40420 - # In order to ignore nan values in the threshold, set the values in - # subset that correspond to these na values to True. This indicates to the - # final where() to not clip. - # if is_list_like(threshold) and threshold.isna().any(axis=None): - # subset_kwargs = {"axis": axis} if threshold.ndim != subset.ndim else {} - # subset = subset.where(threshold.notna(), True, **subset_kwargs) return self.where(subset, threshold, axis=axis, inplace=inplace) @overload @@ -8857,7 +8852,15 @@ def _align_frame( right = right.fillna(method=method, axis=fill_axis, limit=limit) # if DatetimeIndex have different tz, convert to UTC - left, right = _align_as_utc(left, right, join_index) + if is_datetime64tz_dtype(left.index.dtype): + if left.index.tz != right.index.tz: + if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() + left.index = join_index + right.index = join_index return ( left.__finalize__(self), @@ -8899,18 +8902,27 @@ def _align_series( else: # one has > 1 ndim fdata = self._mgr - if axis in [0, 1]: - join_index = self.axes[axis] + if axis == 0: + join_index = self.index lidx, ridx = None, None - if not join_index.equals(other.index): - join_index, lidx, ridx = join_index.join( + if not self.index.equals(other.index): + join_index, lidx, ridx = self.index.join( other.index, how=join, level=level, return_indexers=True ) if lidx is not None: - bm_axis = self._get_block_manager_axis(axis) - fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis) + fdata = fdata.reindex_indexer(join_index, lidx, axis=1) + elif axis == 1: + join_index = self.columns + lidx, ridx = None, None + if not self.columns.equals(other.index): + join_index, lidx, ridx = self.columns.join( + other.index, how=join, level=level, return_indexers=True + ) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=0) else: raise ValueError("Must specify axis=0 or 1") @@ -8932,7 +8944,15 @@ def _align_series( # if DatetimeIndex have different tz, convert to UTC if is_series or (not is_series and axis == 0): - left, right = _align_as_utc(left, right, join_index) + if is_datetime64tz_dtype(left.index.dtype): + if left.index.tz != right.index.tz: + if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() + left.index = join_index + right.index = join_index return ( left.__finalize__(self), @@ -11906,23 +11926,3 @@ def _doc_params(cls): The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. """ - - -def _align_as_utc( - left: FrameOrSeries, right: FrameOrSeries, join_index: Index | None -) -> tuple[FrameOrSeries, FrameOrSeries]: - """ - If we are aligning timezone-aware DatetimeIndexes and the timezones - do not match, convert both to UTC. - """ - if is_datetime64tz_dtype(left.index.dtype): - if left.index.tz != right.index.tz: - if join_index is not None: - # GH#33671 ensure we don't change the index on - # our original Series (NB: by default deep=False) - left = left.copy() - right = right.copy() - left.index = join_index - right.index = join_index - - return left, right From 51b9006a354d642b75abd4f98e941d9476266808 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Fri, 16 Apr 2021 04:22:37 +0200 Subject: [PATCH 11/13] Fix bad merge. --- pandas/core/generic.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 342bc048dbd55..ba9bd00abe439 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8852,15 +8852,7 @@ def _align_frame( right = right.fillna(method=method, axis=fill_axis, limit=limit) # if DatetimeIndex have different tz, convert to UTC - if is_datetime64tz_dtype(left.index.dtype): - if left.index.tz != right.index.tz: - if join_index is not None: - # GH#33671 ensure we don't change the index on - # our original Series (NB: by default deep=False) - left = left.copy() - right = right.copy() - left.index = join_index - right.index = join_index + left, right = _align_as_utc(left, right, join_index) return ( left.__finalize__(self), @@ -8902,16 +8894,17 @@ def _align_series( else: # one has > 1 ndim fdata = self._mgr - if axis == 0: - join_index = self.index + if axis in [0, 1]: + join_index = self.axes[axis] lidx, ridx = None, None - if not self.index.equals(other.index): - join_index, lidx, ridx = self.index.join( + if not join_index.equals(other.index): + join_index, lidx, ridx = join_index.join( other.index, how=join, level=level, return_indexers=True ) if lidx is not None: - fdata = fdata.reindex_indexer(join_index, lidx, axis=1) + bm_axis = self._get_block_manager_axis(axis) + fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis) elif axis == 1: join_index = self.columns @@ -11926,3 +11919,23 @@ def _doc_params(cls): The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. """ + + +def _align_as_utc( + left: FrameOrSeries, right: FrameOrSeries, join_index: Index | None +) -> tuple[FrameOrSeries, FrameOrSeries]: + """ + If we are aligning timezone-aware DatetimeIndexes and the timezones + do not match, convert both to UTC. + """ + if is_datetime64tz_dtype(left.index.dtype): + if left.index.tz != right.index.tz: + if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() + left.index = join_index + right.index = join_index + + return left, right From c3523c3e07920aa5ad611692cd2a173b45af08a9 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Fri, 16 Apr 2021 04:24:22 +0200 Subject: [PATCH 12/13] Fix bad merge pt.2 --- pandas/core/generic.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba9bd00abe439..5f0a4a0a28265 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8937,15 +8937,7 @@ def _align_series( # if DatetimeIndex have different tz, convert to UTC if is_series or (not is_series and axis == 0): - if is_datetime64tz_dtype(left.index.dtype): - if left.index.tz != right.index.tz: - if join_index is not None: - # GH#33671 ensure we don't change the index on - # our original Series (NB: by default deep=False) - left = left.copy() - right = right.copy() - left.index = join_index - right.index = join_index + left, right = _align_as_utc(left, right, join_index) return ( left.__finalize__(self), From 9c033bd7f221ee325e26c75939df0d1b0aad58b4 Mon Sep 17 00:00:00 2001 From: Dries Schaumont Date: Fri, 16 Apr 2021 04:25:41 +0200 Subject: [PATCH 13/13] Fix bad merge pt.3 --- pandas/core/generic.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5f0a4a0a28265..cebd672cbf02d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8906,16 +8906,6 @@ def _align_series( bm_axis = self._get_block_manager_axis(axis) fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis) - elif axis == 1: - join_index = self.columns - lidx, ridx = None, None - if not self.columns.equals(other.index): - join_index, lidx, ridx = self.columns.join( - other.index, how=join, level=level, return_indexers=True - ) - - if lidx is not None: - fdata = fdata.reindex_indexer(join_index, lidx, axis=0) else: raise ValueError("Must specify axis=0 or 1")