From 53cd9df7ffeeaec5fcedb03f8bf61950692087e2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 21:56:23 +0100 Subject: [PATCH 1/6] [ArrayManager] Indexing - implement iset --- .github/workflows/ci.yml | 6 ++++++ pandas/core/frame.py | 9 -------- pandas/core/internals/array_manager.py | 30 ++++++++++++++++++++++---- pandas/core/internals/managers.py | 3 +-- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b551e7ded0178..61c93f1908ddf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,3 +157,9 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + + # indexing iset related (temporary since other tests don't pass yet) + pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_multi_index --array-manager + pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns --array-manager + pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups --array-manager + pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column --array-manager diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1d633aec79e93..c9d2b043af859 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9720,12 +9720,3 @@ def _reindex_for_setitem(value: FrameOrSeriesUnion, index: Index) -> ArrayLike: "incompatible index of inserted column with frame index" ) from err return reindexed_value - - -def _maybe_atleast_2d(value): - # TODO(EA2D): not needed with 2D EAs - - if is_extension_array_dtype(value): - return value - - return np.atleast_2d(np.asarray(value)) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a8493e647f39a..33effb7ad52fd 100644 --- 
a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -665,18 +665,40 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): if lib.is_integer(loc): # TODO normalize array -> this should in theory not be needed? value = extract_array(value, extract_numpy=True) + + # TODO can we avoid needing to unpack this here? That means converting + # DataFrame into 1D array when loc is an integer if isinstance(value, np.ndarray) and value.ndim == 2: + assert value.shape[1] == 1 value = value[0, :] assert isinstance(value, (np.ndarray, ExtensionArray)) - # value = np.asarray(value) - # assert isinstance(value, np.ndarray) + assert value.ndim == 1 assert len(value) == len(self._axes[0]) self.arrays[loc] = value return - # TODO - raise Exception + if isinstance(loc, slice): + indices = range( + loc.start if loc.start is not None else 0, + loc.stop if loc.stop is not None else self.shape_proper[1], + loc.step if loc.step is not None else 1, + ) + else: + assert isinstance(loc, np.ndarray) + if loc.dtype == "bool": + indices = np.nonzero(loc)[0] + else: + # TODO reachable? 
+ indices = loc + + assert value.ndim == 2 + assert value.shape[0] == len(self._axes[0]) + + for value_idx, mgr_idx in enumerate(indices): + value_arr = value[:, value_idx] + self.arrays[mgr_idx] = value_arr + return def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7084fa4013900..3da65bc4a8526 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1140,8 +1140,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - - if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): + elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs value = safe_reshape(value, (1,) + value.shape) From a6aa3b61d4d3bb34006dbf0a3ce25d20a78da5eb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 15:56:26 +0100 Subject: [PATCH 2/6] update docstring --- pandas/core/internals/array_manager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 33effb7ad52fd..29e6a770658a3 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -659,9 +659,14 @@ def idelete(self, indexer): def iset(self, loc: Union[int, slice, np.ndarray], value): """ - Set new item in-place. Does not consolidate. Adds new Block if not - contained in the current set of items + Set new column in-place (replaces (an) existing column(s)). + + Parameters + ---------- + loc : positional location (already bounds checked) + value : array-like """ + # single column if lib.is_integer(loc): # TODO normalize array -> this should in theory not be needed? 
value = extract_array(value, extract_numpy=True) @@ -678,6 +683,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): self.arrays[loc] = value return + # multiple columns (slice or array) if isinstance(loc, slice): indices = range( loc.start if loc.start is not None else 0, From 186806ee937edc53b8b25d83950320c788ba6470 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 15:58:58 +0100 Subject: [PATCH 3/6] elif --- pandas/core/internals/array_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 29e6a770658a3..2c7f4badc441f 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -666,7 +666,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): loc : positional location (already bounds checked) value : array-like """ - # single column + # single column -> single integer index if lib.is_integer(loc): # TODO normalize array -> this should in theory not be needed? 
value = extract_array(value, extract_numpy=True) @@ -683,8 +683,8 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): self.arrays[loc] = value return - # multiple columns (slice or array) - if isinstance(loc, slice): + # multiple columns -> convert slice or array to integer indices + elif isinstance(loc, slice): indices = range( loc.start if loc.start is not None else 0, loc.stop if loc.stop is not None else self.shape_proper[1], From 5c0bb93a8aed8426276d4b443b033d88d0c1846a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 16:03:25 +0100 Subject: [PATCH 4/6] limit to boolean mask if array --- pandas/core/internals/array_manager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 2c7f4badc441f..bc2795727c3bd 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -663,7 +663,8 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): Parameters ---------- - loc : positional location (already bounds checked) + loc : integer, slice or boolean mask + Positional location (already bounds checked) value : array-like """ # single column -> single integer index @@ -692,11 +693,8 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): ) else: assert isinstance(loc, np.ndarray) - if loc.dtype == "bool": - indices = np.nonzero(loc)[0] - else: - # TODO reachable? 
- indices = loc + assert loc.dtype == "bool" + indices = np.nonzero(loc)[0] assert value.ndim == 2 assert value.shape[0] == len(self._axes[0]) From ae12bdb0c2b4c48dec28fbbf6279671a784502c8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 20:43:25 +0100 Subject: [PATCH 5/6] update comment --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index bc2795727c3bd..9077c264efc6d 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -669,7 +669,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): """ # single column -> single integer index if lib.is_integer(loc): - # TODO normalize array -> this should in theory not be needed? + # TODO the extract array should in theory not be needed? value = extract_array(value, extract_numpy=True) # TODO can we avoid needing to unpack this here? That means converting From 1866121e5e26a2df12984a93e3c7ee2717d5bf60 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Feb 2021 11:54:15 +0100 Subject: [PATCH 6/6] update docstring --- pandas/core/internals/array_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 9077c264efc6d..083f32488acd4 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -659,7 +659,10 @@ def idelete(self, indexer): def iset(self, loc: Union[int, slice, np.ndarray], value): """ - Set new column in-place (replaces (an) existing column(s)). + Set new column(s). + + This changes the ArrayManager in-place, but replaces (an) existing + column(s), not changing column values in-place. Parameters ----------