From 390de2beb9eb72b5cdd52a6ba9a1e2f81d895f34 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 26 Mar 2020 14:36:44 -0700 Subject: [PATCH 1/2] CLN: remove BlockManager.get --- pandas/core/frame.py | 25 +++--- pandas/core/generic.py | 12 +-- pandas/core/internals/managers.py | 29 ------ pandas/tests/internals/test_internals.py | 107 ++++++++++++----------- 4 files changed, 78 insertions(+), 95 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8deeb415c17c9..d915b28556e9f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2560,7 +2560,7 @@ def _ixs(self, i: int, axis: int = 0): label = self.columns[i] values = self._data.iget(i) - result = self._box_col_values(values, label) + result = self._box_col_values(values, i) # this is a cached value, mark it so result._set_as_cached(label, self) @@ -2650,7 +2650,7 @@ def _getitem_bool_array(self, key): def _getitem_multilevel(self, key): # self.columns is a MultiIndex loc = self.columns.get_loc(key) - if isinstance(loc, (slice, Series, np.ndarray, Index)): + if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) if self._is_mixed_type: @@ -2683,7 +2683,8 @@ def _getitem_multilevel(self, key): result._set_is_copy(self) return result else: - return self._get_item_cache(key) + # loc is neither a slice nor ndarray, so must be an int + return self._ixs(loc, axis=1) def _get_value(self, index, col, takeable: bool = False): """ @@ -2703,6 +2704,8 @@ def _get_value(self, index, col, takeable: bool = False): series = self._ixs(col, axis=1) return series._values[index] + assert self.columns.is_unique + series = self._get_item_cache(col) engine = self.index._engine @@ -2839,6 +2842,8 @@ def _set_value(self, index, col, value, takeable: bool = False): series._set_value(index, value, takeable=True) return + assert self.columns.is_unique + series = self._get_item_cache(col) engine = self.index._engine loc = engine.get_loc(index) @@ -2874,19 +2879,15 @@ def _ensure_valid_index(self, value): value.index.copy(), axis=1, fill_value=np.nan ) - def _box_item_values(self, key, values): - items = self.columns[self.columns.get_loc(key)] - if values.ndim == 2: - return self._constructor(values.T, columns=items, index=self.index) - else: - return self._box_col_values(values, items) - - def _box_col_values(self, values, items): + def _box_col_values(self, values, loc: int) -> Series: """ Provide boxed values for a column. """ + # Lookup in columns so that if e.g. a str datetime was passed + # we attach the Timestamp object as the name. + name = self.columns[loc] klass = self._constructor_sliced - return klass(values, index=self.index, name=items, fastpath=True) + return klass(values, index=self.index, name=name, fastpath=True) # ---------------------------------------------------------------------- # Unsorted diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b9ce7cce3685c..c867bbb8553aa 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3546,8 +3546,13 @@ def _get_item_cache(self, item): cache = self._item_cache res = cache.get(item) if res is None: - values = self._data.get(item) - res = self._box_item_values(item, values) + # All places that call _get_item_cache have unique columns, + # assuming this lets us simplify whats left + assert self.columns.is_unique + + loc = self.columns.get_loc(item) + values = self._data.iget(loc) + res = self._box_col_values(values, loc) cache[item] = res res._set_as_cached(item, self) @@ -3555,9 +3560,6 @@ def _get_item_cache(self, item): res._is_copy = self._is_copy return res - def _box_item_values(self, key, values): - raise AbstractMethodError(self) - def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: """ Construct a slice of this container. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dda932cafe73b..5372803b3f1ba 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -965,35 +965,6 @@ def _consolidate_inplace(self) -> None: self._known_consolidated = True self._rebuild_blknos_and_blklocs() - def get(self, item): - """ - Return values for selected item (ndarray or BlockManager). - """ - if self.items.is_unique: - - if not isna(item): - loc = self.items.get_loc(item) - else: - indexer = np.arange(len(self.items))[isna(self.items)] - - # allow a single nan location indexer - if not is_scalar(indexer): - if len(indexer) == 1: - loc = indexer.item() - else: - raise ValueError("cannot label index with a null key") - - return self.iget(loc) - else: - - if isna(item): - raise TypeError("cannot label index with a null key") - - indexer = self.items.get_indexer_for([item]) - return self.reindex_indexer( - new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True - ) - def iget(self, i: int) -> "SingleBlockManager": """ Return the data as a SingleBlockManager. diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index bbf968aef4a5c..66607a93e7998 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -348,45 +348,48 @@ def test_categorical_block_pickle(self): smgr2 = tm.round_trip_pickle(smgr) tm.assert_series_equal(Series(smgr), Series(smgr2)) - def test_get(self): + def test_iget(self): cols = Index(list("abc")) values = np.random.rand(3, 3) block = make_block(values=values.copy(), placement=np.arange(3)) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) - tm.assert_almost_equal(mgr.get("a").internal_values(), values[0]) - tm.assert_almost_equal(mgr.get("b").internal_values(), values[1]) - tm.assert_almost_equal(mgr.get("c").internal_values(), values[2]) + tm.assert_almost_equal(mgr.iget(0).internal_values(), values[0]) + tm.assert_almost_equal(mgr.iget(1).internal_values(), values[1]) + tm.assert_almost_equal(mgr.iget(2).internal_values(), values[2]) def test_set(self): mgr = create_mgr("a,b,c: int", item_shape=(3,)) mgr.set("d", np.array(["foo"] * 3)) mgr.set("b", np.array(["bar"] * 3)) - tm.assert_numpy_array_equal(mgr.get("a").internal_values(), np.array([0] * 3)) + tm.assert_numpy_array_equal(mgr.iget(0).internal_values(), np.array([0] * 3)) tm.assert_numpy_array_equal( - mgr.get("b").internal_values(), np.array(["bar"] * 3, dtype=np.object_) + mgr.iget(1).internal_values(), np.array(["bar"] * 3, dtype=np.object_) ) - tm.assert_numpy_array_equal(mgr.get("c").internal_values(), np.array([2] * 3)) + tm.assert_numpy_array_equal(mgr.iget(2).internal_values(), np.array([2] * 3)) tm.assert_numpy_array_equal( - mgr.get("d").internal_values(), np.array(["foo"] * 3, dtype=np.object_) + mgr.iget(3).internal_values(), np.array(["foo"] * 3, dtype=np.object_) ) def test_set_change_dtype(self, mgr): mgr.set("baz", np.zeros(N, dtype=bool)) mgr.set("baz", np.repeat("foo", N)) - assert mgr.get("baz").dtype == np.object_ + idx = mgr.items.get_loc("baz") + assert mgr.iget(idx).dtype == np.object_ mgr2 = mgr.consolidate() mgr2.set("baz", np.repeat("foo", N)) - assert mgr2.get("baz").dtype == np.object_ + idx = mgr2.items.get_loc("baz") + assert mgr2.iget(idx).dtype == np.object_ mgr2.set("quux", tm.randn(N).astype(int)) - assert mgr2.get("quux").dtype == np.int_ + idx = mgr2.items.get_loc("quux") + assert mgr2.iget(idx).dtype == np.int_ mgr2.set("quux", tm.randn(N)) - assert mgr2.get("quux").dtype == np.float_ + assert mgr2.iget(idx).dtype == np.float_ def test_copy(self, mgr): cp = mgr.copy(deep=False) @@ -449,8 +452,8 @@ def test_as_array_datetime(self): def test_as_array_datetime_tz(self): mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]") - assert mgr.get("h").dtype == "datetime64[ns, US/Eastern]" - assert mgr.get("g").dtype == "datetime64[ns, CET]" + assert mgr.iget(0).dtype == "datetime64[ns, US/Eastern]" + assert mgr.iget(1).dtype == "datetime64[ns, CET]" assert mgr.as_array().dtype == "object" @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"]) @@ -460,26 +463,26 @@ def test_astype(self, t): t = np.dtype(t) tmgr = mgr.astype(t) - assert tmgr.get("c").dtype.type == t - assert tmgr.get("d").dtype.type == t - assert tmgr.get("e").dtype.type == t + assert tmgr.iget(0).dtype.type == t + assert tmgr.iget(1).dtype.type == t + assert tmgr.iget(2).dtype.type == t # mixed mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") t = np.dtype(t) tmgr = mgr.astype(t, errors="ignore") - assert tmgr.get("c").dtype.type == t - assert tmgr.get("e").dtype.type == t - assert tmgr.get("f").dtype.type == t - assert tmgr.get("g").dtype.type == t + assert tmgr.iget(2).dtype.type == t + assert tmgr.iget(4).dtype.type == t + assert tmgr.iget(5).dtype.type == t + assert tmgr.iget(6).dtype.type == t - assert tmgr.get("a").dtype.type == np.object_ - assert tmgr.get("b").dtype.type == np.object_ + assert tmgr.iget(0).dtype.type == np.object_ + assert tmgr.iget(1).dtype.type == np.object_ if t != np.int64: - assert tmgr.get("d").dtype.type == np.datetime64 + assert tmgr.iget(3).dtype.type == np.datetime64 else: - assert tmgr.get("d").dtype.type == t + assert tmgr.iget(3).dtype.type == t def test_convert(self): def _compare(old_mgr, new_mgr): @@ -516,11 +519,11 @@ def _compare(old_mgr, new_mgr): mgr.set("b", np.array(["2."] * N, dtype=np.object_)) mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get("a").dtype == np.int64 - assert new_mgr.get("b").dtype == np.float64 - assert new_mgr.get("foo").dtype == np.object_ - assert new_mgr.get("f").dtype == np.int64 - assert new_mgr.get("g").dtype == np.float64 + assert new_mgr.iget(0).dtype == np.int64 + assert new_mgr.iget(1).dtype == np.float64 + assert new_mgr.iget(2).dtype == np.object_ + assert new_mgr.iget(3).dtype == np.int64 + assert new_mgr.iget(4).dtype == np.float64 mgr = create_mgr( "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2" @@ -529,15 +532,15 @@ def _compare(old_mgr, new_mgr): mgr.set("b", np.array(["2."] * N, dtype=np.object_)) mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get("a").dtype == np.int64 - assert new_mgr.get("b").dtype == np.float64 - assert new_mgr.get("foo").dtype == np.object_ - assert new_mgr.get("f").dtype == np.int32 - assert new_mgr.get("bool").dtype == np.bool_ - assert new_mgr.get("dt").dtype.type, np.datetime64 - assert new_mgr.get("i").dtype == np.int64 - assert new_mgr.get("g").dtype == np.float64 - assert new_mgr.get("h").dtype == np.float16 + assert new_mgr.iget(0).dtype == np.int64 + assert new_mgr.iget(1).dtype == np.float64 + assert new_mgr.iget(2).dtype == np.object_ + assert new_mgr.iget(3).dtype == np.int32 + assert new_mgr.iget(4).dtype == np.bool_ + assert new_mgr.iget(5).dtype.type, np.datetime64 + assert new_mgr.iget(6).dtype == np.int64 + assert new_mgr.iget(7).dtype == np.float64 + assert new_mgr.iget(8).dtype == np.float16 def test_invalid_ea_block(self): with pytest.raises(AssertionError, match="block.size != values.size"): @@ -639,16 +642,16 @@ def test_reindex_items(self): assert reindexed.nblocks == 2 tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) tm.assert_almost_equal( - mgr.get("g").internal_values(), reindexed.get("g").internal_values() + mgr.iget(6).internal_values(), reindexed.iget(0).internal_values() ) tm.assert_almost_equal( - mgr.get("c").internal_values(), reindexed.get("c").internal_values() + mgr.iget(2).internal_values(), reindexed.iget(1).internal_values() ) tm.assert_almost_equal( - mgr.get("a").internal_values(), reindexed.get("a").internal_values() + mgr.iget(0).internal_values(), reindexed.iget(2).internal_values() ) tm.assert_almost_equal( - mgr.get("d").internal_values(), reindexed.get("d").internal_values() + mgr.iget(3).internal_values(), reindexed.iget(3).internal_values() ) def test_get_numeric_data(self): @@ -664,13 +667,15 @@ def test_get_numeric_data(self): numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) tm.assert_almost_equal( - mgr.get("float").internal_values(), numeric.get("float").internal_values() + mgr.iget(mgr.items.get_loc("float")).internal_values(), + numeric.iget(numeric.items.get_loc("float")).internal_values(), ) # Check sharing numeric.set("float", np.array([100.0, 200.0, 300.0])) tm.assert_almost_equal( - mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), ) numeric2 = mgr.get_numeric_data(copy=True) @@ -679,7 +684,8 @@ def test_get_numeric_data(self): ) numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) tm.assert_almost_equal( - mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), ) def test_get_bool_data(self): @@ -693,19 +699,22 @@ def test_get_bool_data(self): bools = mgr.get_bool_data() tm.assert_index_equal(bools.items, pd.Index(["bool"])) tm.assert_almost_equal( - mgr.get("bool").internal_values(), bools.get("bool").internal_values() + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + bools.iget(bools.items.get_loc("bool")).internal_values(), ) bools.set("bool", np.array([True, False, True])) tm.assert_numpy_array_equal( - mgr.get("bool").internal_values(), np.array([True, False, True]) + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), ) # Check sharing bools2 = mgr.get_bool_data(copy=True) bools2.set("bool", np.array([False, True, False])) tm.assert_numpy_array_equal( - mgr.get("bool").internal_values(), np.array([True, False, True]) + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), ) def test_unicode_repr_doesnt_raise(self): From 5309605a794039584d0ad25cd3b9139e3923ac52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 12:29:32 -0700 Subject: [PATCH 2/2] Remove assertions --- pandas/core/frame.py | 4 ---- pandas/core/generic.py | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fcbbb96cd1eda..de03ed4ebcfd1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2699,8 +2699,6 @@ def _get_value(self, index, col, takeable: bool = False): series = self._ixs(col, axis=1) return series._values[index] - assert self.columns.is_unique - series = self._get_item_cache(col) engine = self.index._engine @@ -2837,8 +2835,6 @@ def _set_value(self, index, col, value, takeable: bool = False): series._set_value(index, value, takeable=True) return - assert self.columns.is_unique - series = self._get_item_cache(col) engine = self.index._engine loc = engine.get_loc(index) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb2eff52b7f85..e1ec80858e7b2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3548,8 +3548,7 @@ def _get_item_cache(self, item): res = cache.get(item) if res is None: # All places that call _get_item_cache have unique columns, - # assuming this lets us simplify whats left - assert self.columns.is_unique + # pending resolution of GH#33047 loc = self.columns.get_loc(item) values = self._mgr.iget(loc)