Skip to content

Commit 9281191

Browse files
jbrockmendel authored and feefladder committed
BUG: concat with ArrayManager not making copy (pandas-dev#42797)
1 parent 6a2f7b9 commit 9281191

File tree

3 files changed

+48
-23
lines changed

3 files changed

+48
-23
lines changed

pandas/core/internals/concat.py

+5 -2
Original file line number | Diff line number | Diff line change
@@ -94,12 +94,15 @@ def _concatenate_array_managers(
9494
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
9595
for j in range(len(mgrs[0].arrays))
9696
]
97-
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
9897
else:
9998
# concatting along the columns -> combine reindexed arrays in a single manager
10099
assert concat_axis == 0
101100
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
102-
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
101+
if copy:
102+
arrays = [x.copy() for x in arrays]
103+
104+
new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
105+
return new_mgr
103106

104107

105108
def concat_arrays(to_concat: list) -> ArrayLike:

pandas/tests/reshape/concat/test_concat.py

+30 -21
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
import pandas.util._test_decorators as td
12-
1311
import pandas as pd
1412
from pandas import (
1513
DataFrame,
@@ -43,40 +41,51 @@ def test_append_concat(self):
4341
assert isinstance(result.index, PeriodIndex)
4442
assert result.index[0] == s1.index[0]
4543

46-
# TODO(ArrayManager) using block internals to verify, needs rewrite
47-
@td.skip_array_manager_invalid_test
48-
def test_concat_copy(self):
44+
def test_concat_copy(self, using_array_manager):
4945
df = DataFrame(np.random.randn(4, 3))
5046
df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
5147
df3 = DataFrame({5: "foo"}, index=range(4))
5248

5349
# These are actual copies.
5450
result = concat([df, df2, df3], axis=1, copy=True)
5551

56-
for b in result._mgr.blocks:
57-
assert b.values.base is None
52+
for arr in result._mgr.arrays:
53+
assert arr.base is None
5854

5955
# These are the same.
6056
result = concat([df, df2, df3], axis=1, copy=False)
6157

62-
for b in result._mgr.blocks:
63-
if b.dtype.kind == "f":
64-
assert b.values.base is df._mgr.blocks[0].values.base
65-
elif b.dtype.kind in ["i", "u"]:
66-
assert b.values.base is df2._mgr.blocks[0].values.base
67-
elif b.is_object:
68-
assert b.values.base is not None
58+
for arr in result._mgr.arrays:
59+
if arr.dtype.kind == "f":
60+
assert arr.base is df._mgr.arrays[0].base
61+
elif arr.dtype.kind in ["i", "u"]:
62+
assert arr.base is df2._mgr.arrays[0].base
63+
elif arr.dtype == object:
64+
if using_array_manager:
65+
# we get the same array object, which has no base
66+
assert arr is df3._mgr.arrays[0]
67+
else:
68+
assert arr.base is not None
6969

7070
# Float block was consolidated.
7171
df4 = DataFrame(np.random.randn(4, 1))
7272
result = concat([df, df2, df3, df4], axis=1, copy=False)
73-
for b in result._mgr.blocks:
74-
if b.dtype.kind == "f":
75-
assert b.values.base is None
76-
elif b.dtype.kind in ["i", "u"]:
77-
assert b.values.base is df2._mgr.blocks[0].values.base
78-
elif b.is_object:
79-
assert b.values.base is not None
73+
for arr in result._mgr.arrays:
74+
if arr.dtype.kind == "f":
75+
if using_array_manager:
76+
# this is a view on some array in either df or df4
77+
assert any(
78+
np.shares_memory(arr, other)
79+
for other in df._mgr.arrays + df4._mgr.arrays
80+
)
81+
else:
82+
# the block was consolidated, so we got a copy anyway
83+
assert arr.base is None
84+
elif arr.dtype.kind in ["i", "u"]:
85+
assert arr.base is df2._mgr.arrays[0].base
86+
elif arr.dtype == object:
87+
# this is a view on df3
88+
assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
8089

8190
def test_concat_with_group_keys(self):
8291
# axis=0

pandas/tests/reshape/concat/test_dataframe.py

+13
Original file line number | Diff line number | Diff line change
@@ -192,3 +192,16 @@ def test_concat_duplicates_in_index_with_keys(self):
192192
expected = DataFrame(data=data, index=mi)
193193
tm.assert_frame_equal(result, expected)
194194
tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))
195+
196+
@pytest.mark.parametrize("ignore_index", [True, False])
197+
@pytest.mark.parametrize("order", ["C", "F"])
198+
@pytest.mark.parametrize("axis", [0, 1])
199+
def test_concat_copies(self, axis, order, ignore_index):
200+
# based on asv ConcatDataFrames
201+
df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order))
202+
203+
res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True)
204+
205+
for arr in res._iter_column_arrays():
206+
for arr2 in df._iter_column_arrays():
207+
assert not np.shares_memory(arr, arr2)

0 commit comments

Comments
 (0)