-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: ArrayManager not respecting copy keyword #44889
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac23879
a9b5c6b
a605885
0895b0e
01c6ad2
fe0dc04
1bfdcc6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,6 @@ | |
) | ||
from pandas.core.dtypes.common import ( | ||
is_1d_only_ea_dtype, | ||
is_datetime64tz_dtype, | ||
is_datetime_or_timedelta_dtype, | ||
is_dtype_equal, | ||
is_extension_array_dtype, | ||
|
@@ -44,7 +43,6 @@ | |
is_named_tuple, | ||
is_object_dtype, | ||
) | ||
from pandas.core.dtypes.dtypes import ExtensionDtype | ||
from pandas.core.dtypes.generic import ( | ||
ABCDataFrame, | ||
ABCSeries, | ||
|
@@ -475,23 +473,16 @@ def dict_to_mgr( | |
keys = list(data.keys()) | ||
columns = Index(keys) | ||
arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] | ||
# GH#24096 need copy to be deep for datetime64tz case | ||
# TODO: See if we can avoid these copies | ||
arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays] | ||
arrays = [ | ||
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays | ||
] | ||
|
||
if copy: | ||
# arrays_to_mgr (via form_blocks) won't make copies for EAs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is no longer the case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is the case, it just isn't relevant |
||
# dtype attr check to exclude EADtype-castable strs | ||
arrays = [ | ||
x | ||
if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) | ||
else x.copy() | ||
for x in arrays | ||
] | ||
# TODO: can we get rid of the dt64tz special case above? | ||
if typ == "block": | ||
# We only need to copy arrays that will not get consolidated, i.e. | ||
# only EA arrays | ||
arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays] | ||
else: | ||
# dtype check to exclude e.g. range objects, scalars | ||
arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] | ||
|
||
return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -421,10 +421,6 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager, request): | |
# TODO(ArrayManager) .loc still overwrites | ||
expected["B"] = expected["B"].astype("int64") | ||
|
||
mark = pytest.mark.xfail( | ||
reason="Both 'A' columns get set with 3 instead of 0 and 3" | ||
) | ||
request.node.add_marker(mark) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this fixed by the code changes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So I looked into this to understand why it was actually fixing this. And it's actually because the But so that is putting an identical array for each columns. And so if you then mutate one columns, all columns get set, if the columns are not copied. With the default of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will open a separate issue for this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. -> #45369 |
||
else: | ||
# set these with unique columns to be extra-unambiguous | ||
expected[2] = expected[2].astype(np.int64) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2157,8 +2157,6 @@ def test_constructor_ndarray_copy(self, float_frame, using_array_manager): | |
arr[0, 0] = 1000 | ||
assert df.iloc[0, 0] == 1000 | ||
|
||
# TODO(ArrayManager) keep view on Series? | ||
@td.skip_array_manager_not_yet_implemented | ||
def test_constructor_series_copy(self, float_frame): | ||
series = float_frame._series | ||
|
||
|
@@ -2513,13 +2511,10 @@ def test_constructor_list_str_na(self, string_dtype): | |
def test_dict_nocopy( | ||
self, request, copy, any_numeric_ea_dtype, any_numpy_dtype, using_array_manager | ||
): | ||
if using_array_manager and not ( | ||
(any_numpy_dtype in (tm.STRING_DTYPES + tm.BYTES_DTYPES)) | ||
or ( | ||
any_numpy_dtype | ||
in (tm.DATETIME64_DTYPES + tm.TIMEDELTA64_DTYPES + tm.BOOL_DTYPES) | ||
and copy | ||
) | ||
if ( | ||
using_array_manager | ||
and not copy | ||
and not (any_numpy_dtype in (tm.STRING_DTYPES + tm.BYTES_DTYPES)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So is it correct to conclude from this remaining skip that this PR fixed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. correct |
||
): | ||
# TODO(ArrayManager) properly honor copy keyword for dict input | ||
td.mark_array_manager_not_yet_implemented(request) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this related to the other changes (this is not behind an
if copy
check), or just an additional copy that is no longer needed nowadays?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this can be removed independently without breaking any tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This actually causes a change in behaviour:
In the above, the
dti
is not mutated, but with this branch it will be.Now, since I explicitly passed
copy=False
, I suppose this is a bug fix :) For other Index objects it will also mutate the original index.When the original PR (#24096) added this line of code to protect DatetimeIndex from getting mutated, the
copy
behaviour of DataFrame(..) might still have been different.