From 91d8acc892df381ceb2666495287b410aae9d1f6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 21:58:35 +0100 Subject: [PATCH 1/5] ENH: Improve ref-tracking for group keys --- pandas/core/groupby/grouper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 88780bac06637..92f5ffd4ccb6e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -11,6 +11,7 @@ final, ) import warnings +import weakref import numpy as np @@ -947,9 +948,11 @@ def is_in_obj(gpr) -> bool: # For the CoW case, we need an equality check as the identity check # no longer works (each Series from column access is a new object) try: - return gpr.equals(obj[gpr.name]) + obj_gpr_column = obj[gpr.name] except (AttributeError, KeyError, IndexError, InvalidIndexError): return False + ref = weakref.ref(obj_gpr_column._mgr.blocks[0]) + return ref in gpr._mgr.blocks[0].refs.referenced_blocks try: return gpr is obj[gpr.name] except (KeyError, IndexError, InvalidIndexError): From c2c77c6a1195a58f205a2eb34a5cbe0c3d7773ca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 23:38:57 +0100 Subject: [PATCH 2/5] Fix --- pandas/core/groupby/grouper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 92f5ffd4ccb6e..323f9f4f57340 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -951,8 +951,10 @@ def is_in_obj(gpr) -> bool: obj_gpr_column = obj[gpr.name] except (AttributeError, KeyError, IndexError, InvalidIndexError): return False - ref = weakref.ref(obj_gpr_column._mgr.blocks[0]) - return ref in gpr._mgr.blocks[0].refs.referenced_blocks + if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): + ref = weakref.ref(obj_gpr_column._mgr.blocks[0]) + return ref in gpr._mgr.blocks[0].refs.referenced_blocks + return False try: return gpr is obj[gpr.name] except (KeyError, IndexError, InvalidIndexError): From 0873d502d1a1615e3c6836715a2354c542c8b655 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Feb 2023 11:25:30 +0100 Subject: [PATCH 3/5] Refactor --- pandas/core/groupby/grouper.py | 11 ++++++++--- pandas/core/internals/managers.py | 9 +++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 323f9f4f57340..2265e97fb00d2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -11,7 +11,6 @@ final, ) import warnings -import weakref import numpy as np @@ -952,8 +951,14 @@ def is_in_obj(gpr) -> bool: except (AttributeError, KeyError, IndexError, InvalidIndexError): return False if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): - ref = weakref.ref(obj_gpr_column._mgr.blocks[0]) - return ref in gpr._mgr.blocks[0].refs.referenced_blocks + # Item "SingleArrayManager" of "Union[SingleArrayManager, + # SingleBlockManager]" has no attribute "references_same_values" + # Argument 1 to "references_same_values" of "BaseBlockManager" has + # incompatible type "Union[SingleArrayManager, SingleBlockManager]"; + # expected "BaseBlockManager + return gpr._mgr.references_same_values( # type: ignore[union-attr] + obj_gpr_column._mgr, 0 # type: ignore[arg-type] + ) return False try: return gpr is obj[gpr.name] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 892205cebf6a8..8121cc342c944 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -11,6 +11,7 @@ cast, ) import warnings +import weakref import numpy as np @@ -247,6 +248,14 @@ def _has_no_reference_block(self, blkno: int) -> bool: """ return not self.blocks[blkno].refs.has_reference() + def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: + """ + Checks if two blocks from two different block managers reference the + same underlying values. + """ + ref = weakref.ref(self.blocks[blkno]) + return ref in mgr.blocks[blkno].refs.referenced_blocks + def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return dtypes.take(self.blknos) From b7270b4f15836ed9bdf11ed397e40a43c0c0f051 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Feb 2023 19:00:59 +0100 Subject: [PATCH 4/5] Address review --- pandas/core/groupby/grouper.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2265e97fb00d2..d488db5d85123 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -948,14 +948,9 @@ def is_in_obj(gpr) -> bool: # no longer works (each Series from column access is a new object) try: obj_gpr_column = obj[gpr.name] - except (AttributeError, KeyError, IndexError, InvalidIndexError): + except (KeyError, IndexError, InvalidIndexError): return False if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): - # Item "SingleArrayManager" of "Union[SingleArrayManager, - # SingleBlockManager]" has no attribute "references_same_values" - # Argument 1 to "references_same_values" of "BaseBlockManager" has - # incompatible type "Union[SingleArrayManager, SingleBlockManager]"; - # expected "BaseBlockManager return gpr._mgr.references_same_values( # type: ignore[union-attr] obj_gpr_column._mgr, 0 # type: ignore[arg-type] ) From 721dcc106046841b4ad13fc73f80cde045221879 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Feb 2023 19:01:23 +0100 Subject: [PATCH 5/5] Address review --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d488db5d85123..f735ce682fc83 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -944,8 +944,8 @@ def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False if using_copy_on_write(): - # For the CoW case, we need an equality check as the identity check - # no longer works (each Series from column access is a new object) + # For the CoW case, we check the references to determine if the + # series is part of the object try: obj_gpr_column = obj[gpr.name] except (KeyError, IndexError, InvalidIndexError):