REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage (#58804)

mroeschke · web-flow · commit 199bf2084a6e · 2024-06-03T11:44:57.000-07:00
* REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage

* Add back arrays

* Whitespace
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -531,8 +531,8 @@ def shares_memory(left, right) -> bool:
             left._mask, right._mask
         )
 
-    if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1:
-        arr = left._mgr.arrays[0]
+    if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1:
+        arr = left._mgr.blocks[0].values
         return shares_memory(arr, right)
 
     raise NotImplementedError(type(left), type(right))
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1046,7 +1046,7 @@ def _is_homogeneous_type(self) -> bool:
         False
         """
         # The "<" part of "<=" here is for empty DataFrame cases
-        return len({arr.dtype for arr in self._mgr.arrays}) <= 1
+        return len({block.values.dtype for block in self._mgr.blocks}) <= 1
 
     @property
     def _can_fast_transpose(self) -> bool:
@@ -5726,7 +5726,6 @@ def shift(
         periods = cast(int, periods)
 
         ncols = len(self.columns)
-        arrays = self._mgr.arrays
         if axis == 1 and periods != 0 and ncols > 0 and freq is None:
             if fill_value is lib.no_default:
                 # We will infer fill_value to match the closest column
@@ -5752,12 +5751,12 @@ def shift(
 
                 result.columns = self.columns.copy()
                 return result
-            elif len(arrays) > 1 or (
+            elif len(self._mgr.blocks) > 1 or (
                 # If we only have one block and we know that we can't
                 #  keep the same dtype (i.e. the _can_hold_element check)
                 #  then we can go through the reindex_indexer path
                 #  (and avoid casting logic in the Block method).
-                not can_hold_element(arrays[0], fill_value)
+                not can_hold_element(self._mgr.blocks[0].values, fill_value)
             ):
                 # GH#35488 we need to watch out for multi-block cases
                 # We only get here with fill_value not-lib.no_default
@@ -11453,7 +11452,7 @@ def _get_data() -> DataFrame:
         if numeric_only:
             df = _get_data()
         if axis is None:
-            dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+            dtype = find_common_type([block.values.dtype for block in df._mgr.blocks])
             if isinstance(dtype, ExtensionDtype):
                 df = df.astype(dtype)
                 arr = concat_compat(list(df._iter_column_arrays()))
@@ -11478,7 +11477,9 @@ def _get_data() -> DataFrame:
 
             # kurtosis excluded since groupby does not implement it
             if df.shape[1] and name != "kurt":
-                dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+                dtype = find_common_type(
+                    [block.values.dtype for block in df._mgr.blocks]
+                )
                 if isinstance(dtype, ExtensionDtype):
                     # GH 54341: fastpath for EA-backed axis=1 reductions
                     # This flattens the frame into a single 1D array while keeping
@@ -11552,8 +11553,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
         else:
             raise NotImplementedError(name)
 
-        for arr in self._mgr.arrays:
-            middle = func(arr, axis=0, skipna=skipna)
+        for blocks in self._mgr.blocks:
+            middle = func(blocks.values, axis=0, skipna=skipna)
             result = ufunc(result, middle)
 
         res_ser = self._constructor_sliced(result, index=self.index, copy=False)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6373,7 +6373,7 @@ def astype(
             # TODO(EA2D): special case not needed with 2D EAs
             dtype = pandas_dtype(dtype)
             if isinstance(dtype, ExtensionDtype) and all(
-                arr.dtype == dtype for arr in self._mgr.arrays
+                block.values.dtype == dtype for block in self._mgr.blocks
             ):
                 return self.copy(deep=False)
             # GH 18099/22869: columnwise conversion to extension dtype
@@ -11148,9 +11148,9 @@ def _logical_func(
         if (
             self.ndim > 1
             and axis == 1
-            and len(self._mgr.arrays) > 1
+            and len(self._mgr.blocks) > 1
             # TODO(EA2D): special-case not needed
-            and all(x.ndim == 2 for x in self._mgr.arrays)
+            and all(block.values.ndim == 2 for block in self._mgr.blocks)
             and not kwargs
         ):
             # Fastpath avoiding potentially expensive transpose
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1804,10 +1804,10 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None:
 
         # if there is only one block/type, still have to take split path
         # unless the block is one-dimensional or it can hold the value
-        if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1:
+        if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1:
             # in case of dict, keys are indices
             val = list(value.values()) if isinstance(value, dict) else value
-            arr = self.obj._mgr.arrays[0]
+            arr = self.obj._mgr.blocks[0].values
             take_split_path = not can_hold_element(
                 arr, extract_array(val, extract_numpy=True)
             )
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -353,6 +353,8 @@ def arrays(self) -> list[ArrayLike]:
         Warning! The returned arrays don't handle Copy-on-Write, so this should
         be used with caution (only in read-mode).
         """
+        # TODO: Deprecate, usage in Dask
+        # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312
         return [blk.values for blk in self.blocks]
 
     def __repr__(self) -> str:
@@ -2068,7 +2070,7 @@ def array(self) -> ArrayLike:
         """
         Quick access to the backing array of the Block.
         """
-        return self.arrays[0]
+        return self.blocks[0].values
 
     # error: Cannot override writeable attribute with read-only property
     @property
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -287,7 +287,7 @@ def test_transform_groupby_kernel_frame(request, float_frame, op):
     # same thing, but ensuring we have multiple blocks
     assert "E" not in float_frame.columns
     float_frame["E"] = float_frame["A"].copy()
-    assert len(float_frame._mgr.arrays) > 1
+    assert len(float_frame._mgr.blocks) > 1
 
     ones = np.ones(float_frame.shape[0])
     gb2 = float_frame.groupby(ones)
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
@@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data):
             blk = result._mgr.blocks[0]
             assert isinstance(blk, NumpyBlock), type(blk)
             assert blk.is_object
-        assert isinstance(result._mgr.arrays[0], np.ndarray)
-        assert result._mgr.arrays[0].dtype == np.dtype(object)
+        arr = result._mgr.blocks[0].values
+        assert isinstance(arr, np.ndarray)
+        assert arr.dtype == np.dtype(object)
 
         # check that we can compare the dtypes
         comp = result.dtypes == df.dtypes
diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py
@@ -69,15 +69,15 @@ def test_dataframe_constructor_from_dict(self, data, from_series):
         assert result.shape == (len(data), 1)
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     def test_dataframe_from_series(self, data):
         result = pd.DataFrame(pd.Series(data))
         assert result.dtypes[0] == data.dtype
         assert result.shape == (len(data), 1)
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     def test_series_given_mismatched_index_raises(self, data):
         msg = r"Length of values \(3\) does not match length of index \(5\)"
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -450,7 +450,7 @@ def test_loc_len1(self, data):
         df = pd.DataFrame({"A": data})
         res = df.loc[[0], "A"]
         assert res.ndim == 1
-        assert res._mgr.arrays[0].ndim == 1
+        assert res._mgr.blocks[0].ndim == 1
         if hasattr(res._mgr, "blocks"):
             assert res._mgr._block.ndim == 1
 
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -29,7 +29,7 @@ def test_concat(self, data, in_frame):
         assert dtype == data.dtype
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     @pytest.mark.parametrize("in_frame", [True, False])
     def test_concat_all_na_block(self, data_missing, in_frame):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -340,8 +340,8 @@ def test_setitem_dt64tz(self, timezone_frame):
         # assert that A & C are not sharing the same base (e.g. they
         # are copies)
         # Note: This does not hold with Copy on Write (because of lazy copying)
-        v1 = df._mgr.arrays[1]
-        v2 = df._mgr.arrays[2]
+        v1 = df._mgr.blocks[1].values
+        v2 = df._mgr.blocks[2].values
         tm.assert_extension_array_equal(v1, v2)
         v1base = v1._ndarray.base
         v2base = v2._ndarray.base
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -214,7 +214,7 @@ def test_corr_item_cache(self):
         df["B"] = range(10)[::-1]
 
         ser = df["A"]  # populate item_cache
-        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks
+        assert len(df._mgr.blocks) == 2
 
         _ = df.corr(numeric_only=True)
 
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
@@ -47,7 +47,7 @@ def test_fillna_on_column_view(self):
         assert np.isnan(arr[:, 0]).all()
 
         # i.e. we didn't create a new 49-column block
-        assert len(df._mgr.arrays) == 1
+        assert len(df._mgr.blocks) == 1
         assert np.shares_memory(df.values, arr)
 
     def test_fillna_datetime(self, datetime_frame):
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
@@ -320,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series):
         def get_cat_values(ndframe):
             # For Series we could just do ._values; for DataFrame
             #  we may be able to do this if we ever have 2D Categoricals
-            return ndframe._mgr.arrays[0]
+            return ndframe._mgr.blocks[0].values
 
         cat = get_cat_values(obj)
 
@@ -560,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self):
         # same thing but not consolidated; pre-2.0 we got different behavior
         df3 = DataFrame({"A": ser})
         df3["B"] = ser
-        assert len(df3._mgr.arrays) == 2
+        assert len(df3._mgr.blocks) == 2
         result = df3.shift(1, axis=1, fill_value=0)
         tm.assert_frame_equal(result, expected)
 
@@ -621,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat):
         # same thing but not consolidated
         df3 = DataFrame({"A": ser})
         df3["B"] = ser
-        assert len(df3._mgr.arrays) == 2
+        assert len(df3._mgr.blocks) == 2
         result = df3.shift(-1, axis=1, fill_value="foo")
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
@@ -256,7 +256,7 @@ def test_private_values_dt64_multiblock(self):
         df = DataFrame({"A": dta[:4]}, copy=False)
         df["B"] = dta[4:]
 
-        assert len(df._mgr.arrays) == 2
+        assert len(df._mgr.blocks) == 2
 
         result = df._values
         expected = dta.reshape(2, 4).T
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -180,24 +180,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series):
             arr = arr[:, 0]
 
         obj = frame_or_series(arr, dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         # go through a different path in internals.construction
         obj = frame_or_series(frame_or_series(arr), dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object))
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         if frame_or_series is DataFrame:
             # other paths through internals.construction
             sers = [Series(x) for x in arr]
             obj = frame_or_series(sers, dtype=object)
-            assert obj._mgr.arrays[0].dtype == object
-            assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+            assert obj._mgr.blocks[0].values.dtype == object
+            assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
     def test_series_with_name_not_matching_column(self):
         # GH#9232
@@ -297,7 +297,7 @@ def test_constructor_dtype_nocast_view_dataframe(self):
     def test_constructor_dtype_nocast_view_2d_array(self):
         df = DataFrame([[1, 2], [3, 4]], dtype="int64")
         df2 = DataFrame(df.values, dtype=df[0].dtype)
-        assert df2._mgr.arrays[0].flags.c_contiguous
+        assert df2._mgr.blocks[0].values.flags.c_contiguous
 
     @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies")
     def test_1d_object_array_does_not_copy(self):
@@ -2493,27 +2493,27 @@ def get_base(obj):
         def check_views(c_only: bool = False):
             # Check that the underlying data behind df["c"] is still `c`
             #  after setting with iloc.  Since we don't know which entry in
-            #  df._mgr.arrays corresponds to df["c"], we just check that exactly
+            #  df._mgr.blocks corresponds to df["c"], we just check that exactly
             #  one of these arrays is `c`.  GH#38939
-            assert sum(x is c for x in df._mgr.arrays) == 1
+            assert sum(x.values is c for x in df._mgr.blocks) == 1
             if c_only:
                 # If we ever stop consolidating in setitem_with_indexer,
                 #  this will become unnecessary.
                 return
 
             assert (
                 sum(
-                    get_base(x) is a
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is a
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
             assert (
                 sum(
-                    get_base(x) is b
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is b
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
@@ -3045,7 +3045,7 @@ def test_construction_from_ndarray_datetimelike(self):
         # constructed from 2D ndarray
         arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
         df = DataFrame(arr)
-        assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
+        assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks)
 
     def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
         arr = np.random.default_rng(2).standard_normal((10, 2))
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -285,7 +285,7 @@ def test_read_only_buffer_source_agg(agg):
             "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
         }
     )
-    df._mgr.arrays[0].flags.writeable = False
+    df._mgr.blocks[0].values.flags.writeable = False
 
     result = df.groupby(["species"]).agg({"sepal_length": agg})
     expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -114,7 +114,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array
         if frame_or_series is Series:
             values = obj.values
         else:
-            values = obj._mgr.arrays[0]
+            values = obj._mgr.blocks[0].values
 
         if frame_or_series is Series:
             obj.iloc[:2] = index_or_series_or_array(arr[2:])
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py

Original file line number	Diff line number	Diff line change
`@@ -531,8 +531,8 @@ def shares_memory(left, right) -> bool:`
`531`	`531`	`left._mask, right._mask`
`532`	`532`	`)`
`533`	`533`
`534`		`- if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1:`
`535`		`- arr = left._mgr.arrays[0]`
	`534`	`+ if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1:`
	`535`	`+ arr = left._mgr.blocks[0].values`
`536`	`536`	`return shares_memory(arr, right)`
`537`	`537`
`538`	`538`	`raise NotImplementedError(type(left), type(right))`
Original file line number	Diff line number	Diff line change
`@@ -285,7 +285,7 @@ def test_read_only_buffer_source_agg(agg):`
`285`	`285`	`"species": ["setosa", "setosa", "setosa", "setosa", "setosa"],`
`286`	`286`	`}`
`287`	`287`	`)`
`288`		`- df._mgr.arrays[0].flags.writeable = False`
	`288`	`+ df._mgr.blocks[0].values.flags.writeable = False`
`289`	`289`
`290`	`290`	`result = df.groupby(["species"]).agg({"sepal_length": agg})`
`291`	`291`	`expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})`