From 61983d8747fd589f07fb3a1945a9af94c3850905 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 12:47:21 +0100 Subject: [PATCH 01/12] [ArrayManager] DataFrame constructors --- pandas/core/frame.py | 60 +++++++++++++------ pandas/core/generic.py | 2 + pandas/core/internals/construction.py | 39 +++++++----- .../frame/constructors/test_from_records.py | 10 +++- 4 files changed, 77 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 131a96d10a6d0..2de39dc7c4330 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -563,29 +563,41 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - if isinstance(data, (BlockManager, ArrayManager)): - if index is None and columns is None and dtype is None and copy is False: - # GH#33357 fastpath - NDFrame.__init__(self, data) - return + if ( + index is None + and columns is None + and dtype is None + and copy is False + and isinstance(data, (BlockManager, ArrayManager)) + ): + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + manager = get_option("mode.data_manager") + + if isinstance(data, (BlockManager, ArrayManager)): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) + mgr = masked_rec_array_to_mgr( + data, index, columns, dtype, copy, typ=manager + ) # a masked array else: data = sanitize_masked_array(data) - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -593,11 +605,15 @@ def __init__( data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, typ=manager) elif getattr(data, "name", None) is not None: - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + mgr = init_dict( + {data.name: data}, index, columns, dtype=dtype, typ=manager + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): @@ -610,11 +626,15 @@ def __init__( arrays, columns, index = nested_data_to_arrays( data, columns, index, dtype ) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, columns, index, columns, dtype=dtype, typ=manager + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) else: - mgr = init_dict({}, index, columns, dtype=dtype) + mgr = init_dict({}, index, columns, dtype=dtype, typ=manager) # For data is scalar else: if index is None or columns is None: @@ -631,18 +651,19 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, typ=manager + ) else: values = construct_2d_arraylike_from_scalar( data, len(index), len(columns), dtype, copy ) mgr = init_ndarray( - values, index, columns, dtype=values.dtype, copy=False + values, index, columns, dtype=values.dtype, copy=False, typ=manager ) # ensure correct Manager type according to settings - manager = get_option("mode.data_manager") mgr = mgr_to_mgr(mgr, typ=manager) NDFrame.__init__(self, mgr) @@ -1970,7 +1991,8 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + manager = get_option("mode.data_manager") + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager) return cls(mgr) @@ -2177,6 +2199,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) + manager = get_option("mode.data_manager") mgr = arrays_to_mgr( arrays, columns, @@ -2184,6 +2207,7 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, + typ=manager, ) return cls(mgr) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1b7c02cd7a05b..68a6b270a66f8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -138,6 +138,7 @@ ArrayManager, BlockManager, ) +from pandas.core.internals.construction import mgr_to_mgr from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat @@ -5752,6 +5753,7 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Internal ONLY - only works for BlockManager """ mgr = self._mgr + mgr = mgr_to_mgr(mgr, "block") mgr = cast(BlockManager, mgr) return { k: self._constructor(v).__finalize__(self) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index eb1a7a355f313..221c8c1ec4a53 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -71,7 +71,9 @@ get_objs_combined_axis, union_indexes, ) +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.managers import ( + BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, ) @@ -90,6 +92,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + typ: Optional[str] = None, ): """ Segregate Series based on type and coerce into matrices. @@ -116,11 +119,16 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + if typ == "block": + return create_block_manager_from_arrays(arrays, arr_names, axes) + elif typ == "array": + return ArrayManager(arrays, [index, columns]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") def masked_rec_array_to_mgr( - data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool + data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ=None ): """ Extract from a masked rec array and create the manager. @@ -154,7 +162,7 @@ def masked_rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ) if copy: mgr = mgr.copy() @@ -166,11 +174,6 @@ def mgr_to_mgr(mgr, typ: str): Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. """ - from pandas.core.internals import ( - ArrayManager, - BlockManager, - ) - new_mgr: Manager if typ == "block": @@ -178,7 +181,12 @@ def mgr_to_mgr(mgr, typ: str): new_mgr = mgr else: new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None + mgr.arrays, + mgr.axes[0], + mgr.axes[1], + mgr.axes[0], + dtype=None, + typ="block", ) elif typ == "array": if isinstance(mgr, ArrayManager): @@ -187,7 +195,7 @@ def mgr_to_mgr(mgr, typ: str): arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: - raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'") + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr @@ -195,7 +203,9 @@ def mgr_to_mgr(mgr, typ: str): # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): +def init_ndarray( + values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str +): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): @@ -224,7 +234,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) # by definition an array here # the dtypes will be coerced to a single dtype @@ -277,7 +287,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None, typ=None): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -321,7 +331,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ) def nested_data_to_arrays( @@ -415,6 +425,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) + val = extract_array(val, extract_numpy=True) else: if isinstance(val, dict): if oindex is None: diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 0d36f3bd80e26..87ca5d1a8a170 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -184,7 +184,10 @@ def test_from_records_bad_index_column(self): tm.assert_index_equal(df1.index, Index(df.C)) # should fail - msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)" + msg = ( + r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)|" + "Passed arrays should have the same length as the rows Index: 10 vs 1 rows" + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(df, index=[2]) with pytest.raises(KeyError, match=r"^2$"): @@ -259,7 +262,10 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + msg = ( + r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)|" + "Passed arrays should have the same length as the rows Index: 2 vs 1 rows" + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) From 1d0315f6f7c8e99e07d5fda2ae06191e1ab6786d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 14:23:00 +0100 Subject: [PATCH 02/12] clean-up signatures --- pandas/core/internals/construction.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 221c8c1ec4a53..a940e1935a13a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -128,7 +128,7 @@ def arrays_to_mgr( def masked_rec_array_to_mgr( - data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ=None + data: MaskedRecords, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str ): """ Extract from a masked rec array and create the manager. @@ -181,12 +181,7 @@ def mgr_to_mgr(mgr, typ: str): new_mgr = mgr else: new_mgr = arrays_to_mgr( - mgr.arrays, - mgr.axes[0], - mgr.axes[1], - mgr.axes[0], - dtype=None, - typ="block", + mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block" ) elif typ == "array": if isinstance(mgr, ArrayManager): @@ -287,7 +282,7 @@ def init_ndarray( return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None, typ=None): +def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. From ffc831446878b098672f4ed1c36d26117823a3e7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 14:42:17 +0100 Subject: [PATCH 03/12] 'fix' for PandasArrays --- pandas/core/internals/construction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index a940e1935a13a..892ddfd335b5b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -420,7 +420,10 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - val = extract_array(val, extract_numpy=True) + # extract_array should be preferred? But that gives failures for + # `extension/test_numpy.py` + # val = extract_array(val, extract_numpy=True) + val = val._values else: if isinstance(val, dict): if oindex is None: From 46e73c80b497fc94df9c952a3f27facf7ac26286 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 15:10:28 +0100 Subject: [PATCH 04/12] tests --- .github/workflows/ci.yml | 3 +++ pandas/core/internals/array_manager.py | 8 +++++- .../frame/constructors/test_from_records.py | 6 +++++ pandas/tests/frame/test_constructors.py | 25 ++++++++++++++++--- 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 461363d295f6a..9240cf4d4519a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,9 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/test_constructors.py --array-manager + pytest pandas/tests/frame/constructors/ --array-manager + pytest pandas/tests/arithmetic/ --array-manager pytest pandas/tests/reshape/merge --array-manager diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d38d278e89a67..0d73acd080f7a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -741,7 +741,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = extract_array(value, extract_numpy=True) if value.ndim == 2: - value = value[0, :] + if value.shape[0] == 1: + value = value[0, :] + else: + raise ValueError( + f"expected 1D array, got array with shape {value.shape}" + ) + # TODO self.arrays can be empty # assert len(value) == len(self.arrays[0]) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 87ca5d1a8a170..98713cc690a6c 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,6 +6,7 @@ import pytz from pandas.compat import is_platform_little_endian +import pandas.util._test_decorators as td from pandas import ( CategoricalIndex, @@ -118,6 +119,8 @@ def test_from_records_sequencelike(self): tm.assert_series_equal(result["C"], df["C"]) tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records + def test_from_records_sequencelike_empty(self): # empty case result = DataFrame.from_records([], columns=["foo", "bar", "baz"]) assert len(result) == 0 @@ -211,6 +214,7 @@ def __iter__(self): expected = DataFrame.from_records(tups) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_len0_with_columns(self): # GH#2633 result = DataFrame.from_records([], index="foo", columns=["foo", "bar"]) @@ -392,6 +396,7 @@ def create_dict(order_id): result = DataFrame.from_records(documents, index=["order_id", "quantity"]) assert result.index.names == ("order_id", "quantity") + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_misc_brokenness(self): # GH#2179 @@ -430,6 +435,7 @@ def test_from_records_misc_brokenness(self): ) tm.assert_series_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_empty(self): # GH#3562 result = DataFrame.from_records([], columns=["a", "b", "c"]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 14adc8a992609..ba2c6daa11137 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytz from pandas.compat import np_version_under1p19 +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -159,7 +160,10 @@ def test_constructor_cast_failure(self): df["foo"] = np.ones((4, 2)).tolist() # this is not ok - msg = "Wrong number of items passed 2, placement implies 1" + msg = ( + "Wrong number of items passed 2, placement implies 1" + "|expected 1D array, got array" + ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) @@ -174,12 +178,15 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view(self): + def test_constructor_dtype_nocast_view_dataframe(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 + @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? + def test_constructor_dtype_nocast_view_2d_array(self): + df = DataFrame([[1, 2]]) should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -1931,6 +1938,8 @@ def test_constructor_frame_copy(self, float_frame): assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() + # TODO(ArrayManager) keep view on 2D array? + @td.skip_array_manager_not_yet_implemented def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) @@ -1941,6 +1950,8 @@ def test_constructor_ndarray_copy(self, float_frame): float_frame.values[6] = 6 assert not (df.values[6] == 6).all() + # TODO(ArrayManager) keep view on Series? + @td.skip_array_manager_not_yet_implemented def test_constructor_series_copy(self, float_frame): series = float_frame._series @@ -2054,7 +2065,10 @@ def test_from_nested_listlike_mixed_types(self): def test_construct_from_listlikes_mismatched_lengths(self): # invalid (shape) - msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" + msg = ( + r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)|" + "Passed arrays should have the same length as the rows Index" + ) with pytest.raises(ValueError, match=msg): DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) @@ -2106,6 +2120,8 @@ def test_check_dtype_empty_numeric_column(self, dtype): assert data.b.dtype == dtype + # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) @@ -2209,7 +2225,8 @@ class DatetimeSubclass(datetime): def test_with_mismatched_index_length_raises(self): # GH#33437 dti = date_range("2016-01-01", periods=3, tz="US/Pacific") - with pytest.raises(ValueError, match="Shape of passed values"): + msg = "Shape of passed values|Passed arrays should have the same length" + with pytest.raises(ValueError, match=msg): DataFrame(dti, index=range(4)) def test_frame_ctor_datetime64_column(self): From 3e108df446af36b336ee5e7240a43c931bb991fa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Feb 2021 16:02:58 +0100 Subject: [PATCH 05/12] ensure datetime-like array --- pandas/core/internals/construction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 892ddfd335b5b..7acbc6ad8068b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -61,6 +61,7 @@ ) from pandas.core.arrays import Categorical from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -110,7 +111,8 @@ def arrays_to_mgr( # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) - + if typ == "array": + arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] columns = ensure_index(columns) else: columns = ensure_index(columns) From 8726d421ceb7c858ec309534140976d4035ce560 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 14:29:21 +0100 Subject: [PATCH 06/12] small clean-up - additional comments --- pandas/core/frame.py | 2 ++ pandas/core/generic.py | 1 + pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/construction.py | 5 +++-- pandas/tests/frame/test_constructors.py | 2 +- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fffb99cf1d5d6..8acfcc9ad70b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -564,6 +564,8 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) if ( index is None and columns is None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 68a6b270a66f8..5b83ada85d3f6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5753,6 +5753,7 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Internal ONLY - only works for BlockManager """ mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well mgr = mgr_to_mgr(mgr, "block") mgr = cast(BlockManager, mgr) return { diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1c2267a0ca691..8bb57ff0ae9a0 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -745,7 +745,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = value[0, :] else: raise ValueError( - f"expected 1D array, got array with shape {value.shape}" + f"Expected a 1D array, got an array with shape {value.shape}" ) # TODO self.arrays can be empty diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7acbc6ad8068b..cd9c370165f8a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -422,8 +422,9 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - # extract_array should be preferred? But that gives failures for - # `extension/test_numpy.py` + # TODO extract_array should be preferred, but that gives failures for + # `extension/test_numpy.py` (extract_array will convert numpy arrays + # to PandasArray) # val = extract_array(val, extract_numpy=True) val = val._values else: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ba2c6daa11137..95c808326235b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -162,7 +162,7 @@ def test_constructor_cast_failure(self): # this is not ok msg = ( "Wrong number of items passed 2, placement implies 1" - "|expected 1D array, got array" + "|Expected a 1D array, got an array with shape \\(4, 2\\)" ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) From 6e171838fd73366b9f472ad257801d45e155a7ab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 16:45:01 +0100 Subject: [PATCH 07/12] use string join for msg --- .../frame/constructors/test_from_records.py | 16 ++++++++++------ pandas/tests/frame/test_constructors.py | 16 ++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 98713cc690a6c..3ead6b722713c 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -187,9 +187,11 @@ def test_from_records_bad_index_column(self): tm.assert_index_equal(df1.index, Index(df.C)) # should fail - msg = ( - r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)|" - "Passed arrays should have the same length as the rows Index: 10 vs 1 rows" + msg = "|".join( + [ + r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 10 vs 1", + ] ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(df, index=[2]) @@ -266,9 +268,11 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = ( - r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)|" - "Passed arrays should have the same length as the rows Index: 2 vs 1 rows" + msg = "|".join( + [ + r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 2 vs 1", + ] ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 95c808326235b..19dd85d3eeeb6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -160,9 +160,11 @@ def test_constructor_cast_failure(self): df["foo"] = np.ones((4, 2)).tolist() # this is not ok - msg = ( - "Wrong number of items passed 2, placement implies 1" - "|Expected a 1D array, got an array with shape \\(4, 2\\)" + msg = "|".join( + [ + "Wrong number of items passed 2, placement implies 1", + "Expected a 1D array, got an array with shape \\(4, 2\\)", + ] ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) @@ -2065,9 +2067,11 @@ def test_from_nested_listlike_mixed_types(self): def test_construct_from_listlikes_mismatched_lengths(self): # invalid (shape) - msg = ( - r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)|" - "Passed arrays should have the same length as the rows Index" + msg = "|".join( + [ + r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)", + "Passed arrays should have the same length as the rows Index", + ] ) with pytest.raises(ValueError, match=msg): DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) From 8096665343e3aea8476ec5148ba45228936c27be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Feb 2021 16:56:36 +0100 Subject: [PATCH 08/12] add github issue link to comment --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cd9c370165f8a..974e3a961db8e 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -424,7 +424,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): val = val.reindex(index, copy=False) # TODO extract_array should be preferred, but that gives failures for # `extension/test_numpy.py` (extract_array will convert numpy arrays - # to PandasArray) + # to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 # val = extract_array(val, extract_numpy=True) val = val._values else: From 54d36ab4976831de3222c1ba7cc5bccc9bc24a72 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Mar 2021 10:19:52 +0100 Subject: [PATCH 09/12] move wrapping inside ArrayManager constructor --- pandas/core/internals/array_manager.py | 1 + pandas/core/internals/construction.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b9638f199c927..836c85c106d42 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -113,6 +113,7 @@ def __init__( if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] + self.arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] self._verify_integrity() def make_empty(self: T, axes=None) -> T: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 618ed4830260f..296d0dbd50cf6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -59,7 +59,6 @@ ) from pandas.core.arrays import Categorical from pandas.core.construction import ( - ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -109,8 +108,7 @@ def arrays_to_mgr( # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) - if typ == "array": - arrays = [ensure_wrapped_if_datetimelike(arr) for arr in arrays] + columns = ensure_index(columns) else: columns = ensure_index(columns) From c56ffa892751349b6e053c60c1c76406e46263bf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Mar 2021 10:27:36 +0100 Subject: [PATCH 10/12] remove skip --- pandas/tests/groupby/test_groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b270539921c9c..8cbb9d2443cb2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1196,7 +1196,6 @@ def convert_force_pure(x): assert isinstance(result[0], Decimal) -@td.skip_array_manager_not_yet_implemented def test_groupby_dtype_inference_empty(): # GH 6733 df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")}) From 164387c8894a0b9c3aa7d98668e2faa77cdcee2c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Mar 2021 13:24:52 +0100 Subject: [PATCH 11/12] trigger ci From 143b57238b72b75758bcbdfa8236db153bacd653 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Mar 2021 14:03:36 +0100 Subject: [PATCH 12/12] add skip for rename copy --- pandas/tests/frame/methods/test_rename.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 677d862dfe077..462d588aff58f 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -170,6 +170,7 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) renamed["foo"] = 1.0