Skip to content

[ArrayManager] TST: run (+fix/skip) pandas/tests/frame/indexing tests #40323

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 12, 2021
Merged
9 changes: 1 addition & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,7 @@ jobs:
run: |
source activate pandas-dev

pytest pandas/tests/frame/methods
pytest pandas/tests/frame/test_constructors.py
pytest pandas/tests/frame/test_*
pytest pandas/tests/frame/test_reductions.py
pytest pandas/tests/frame/
pytest pandas/tests/reductions/
pytest pandas/tests/generic/test_generic.py
pytest pandas/tests/arithmetic/
Expand All @@ -170,10 +167,6 @@ jobs:
pytest pandas/tests/series/test_*

# indexing subset (temporary since other tests don't pass yet)
pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean
pytest pandas/tests/frame/indexing/test_where.py
pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_multi_index
pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns
pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups
pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column

Expand Down
6 changes: 1 addition & 5 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
# DataFrame into 1D array when loc is an integer
if isinstance(value, np.ndarray) and value.ndim == 2:
assert value.shape[1] == 1
value = value[0, :]
value = value[:, 0]

# TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
# but we should avoid that and pass directly the proper array
Expand Down Expand Up @@ -1168,10 +1168,6 @@ def axes(self):
def index(self) -> Index:
return self._axes[0]

@property
def array(self):
return self.arrays[0]

@property
def dtype(self):
return self.array.dtype
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ def isna(self: T, func) -> T:
class SingleDataManager(DataManager):
ndim = 1

@property
def array(self):
"""
Quick access to the backing array of the Block or SingleArrayManager.
"""
return self.arrays[0] # type: ignore[attr-defined]


def interleaved_dtype(dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
"""
Expand Down
63 changes: 46 additions & 17 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

from pandas._libs import iNaT
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer

Expand Down Expand Up @@ -534,6 +535,7 @@ def test_getitem_setitem_integer_slice_keyerrors(self):
with pytest.raises(KeyError, match=r"^3$"):
df2.loc[3:11] = 0

@td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view
def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):
sliced = float_string_frame.iloc[:, -3:]
assert sliced["D"].dtype == np.float64
Expand Down Expand Up @@ -592,6 +594,7 @@ def test_getitem_fancy_scalar(self, float_frame):
for idx in f.index[::5]:
assert ix[idx, col] == ts[idx]

@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
def test_setitem_fancy_scalar(self, float_frame):
f = float_frame
expected = float_frame.copy()
Expand Down Expand Up @@ -631,6 +634,7 @@ def test_getitem_fancy_boolean(self, float_frame):
expected = f.reindex(index=f.index[boolvec], columns=["C", "D"])
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
def test_setitem_fancy_boolean(self, float_frame):
# from 2d, set with booleans
frame = float_frame.copy()
Expand Down Expand Up @@ -990,21 +994,29 @@ def test_iloc_row(self):
expected = df.loc[8:14]
tm.assert_frame_equal(result, expected)

# list of integers
result = df.iloc[[1, 2, 4, 6]]
expected = df.reindex(df.index[[1, 2, 4, 6]])
tm.assert_frame_equal(result, expected)

def test_iloc_row_slice_view(self, using_array_manager):
df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2))
original = df.copy()

# verify slice is view
# setting it makes it raise/warn
subset = df.iloc[slice(4, 8)]

msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
result[2] = 0.0
subset[2] = 0.0

exp_col = df[2].copy()
exp_col[4:8] = 0.0
exp_col = original[2].copy()
# TODO(ArrayManager) verify it is expected that the original didn't change
if not using_array_manager:
exp_col[4:8] = 0.0
tm.assert_series_equal(df[2], exp_col)

# list of integers
result = df.iloc[[1, 2, 4, 6]]
expected = df.reindex(df.index[[1, 2, 4, 6]])
tm.assert_frame_equal(result, expected)

def test_iloc_col(self):

df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2))
Expand All @@ -1022,19 +1034,32 @@ def test_iloc_col(self):
expected = df.loc[:, 8:14]
tm.assert_frame_equal(result, expected)

# verify slice is view
# and that we are setting a copy
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
result[8] = 0.0

assert (df[8] == 0).all()

# list of integers
result = df.iloc[:, [1, 2, 4, 6]]
expected = df.reindex(columns=df.columns[[1, 2, 4, 6]])
tm.assert_frame_equal(result, expected)

def test_iloc_col_slice_view(self, using_array_manager):
df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2))
original = df.copy()
subset = df.iloc[:, slice(4, 8)]

if not using_array_manager:
# verify slice is view
# and that we are setting a copy
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
subset[8] = 0.0

assert (df[8] == 0).all()
else:
# TODO(ArrayManager) verify this is the desired behaviour
subset[8] = 0.0
# subset changed
assert (subset[8] == 0).all()
# but df itself did not change (setitem replaces full column)
tm.assert_frame_equal(df, original)

def test_loc_duplicates(self):
# gh-17105

Expand Down Expand Up @@ -1218,7 +1243,7 @@ def test_setitem(self, uint64_frame):
)


def test_object_casting_indexing_wraps_datetimelike():
def test_object_casting_indexing_wraps_datetimelike(using_array_manager):
# GH#31649, check the indexing methods all the way down the stack
df = DataFrame(
{
Expand All @@ -1240,6 +1265,10 @@ def test_object_casting_indexing_wraps_datetimelike():
assert isinstance(ser.values[1], Timestamp)
assert isinstance(ser.values[2], pd.Timedelta)

if using_array_manager:
# remainder of the test checking BlockManager internals
return

mgr = df._mgr
mgr._rebuild_blknos_and_blklocs()
arr = mgr.fast_xs(0)
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,17 @@ def test_insert_with_columns_dups(self):
)
tm.assert_frame_equal(df, exp)

def test_insert_item_cache(self):
def test_insert_item_cache(self, using_array_manager):
df = DataFrame(np.random.randn(4, 3))
ser = df[0]

with tm.assert_produces_warning(PerformanceWarning):
if using_array_manager:
expected_warning = None
else:
# with BlockManager warn about high fragmentation of single dtype
expected_warning = PerformanceWarning

with tm.assert_produces_warning(expected_warning):
for n in range(100):
df[n + 3] = df[1] * n

Expand Down
26 changes: 18 additions & 8 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.base import registry as ea_registry
from pandas.core.dtypes.common import (
is_categorical_dtype,
Expand Down Expand Up @@ -298,12 +300,12 @@ def test_setitem_dt64tz(self, timezone_frame):

# assert that A & C are not sharing the same base (e.g. they
# are copies)
b1 = df._mgr.blocks[1]
b2 = df._mgr.blocks[2]
tm.assert_extension_array_equal(b1.values, b2.values)
b1base = b1.values._data.base
b2base = b2.values._data.base
assert b1base is None or (id(b1base) != id(b2base))
v1 = df._mgr.arrays[1]
v2 = df._mgr.arrays[2]
tm.assert_extension_array_equal(v1, v2)
v1base = v1._data.base
v2base = v2._data.base
assert v1base is None or (id(v1base) != id(v2base))

# with nan
df2 = df.copy()
Expand Down Expand Up @@ -366,7 +368,7 @@ def test_setitem_frame_length_0_str_key(self, indexer):
expected["A"] = expected["A"].astype("object")
tm.assert_frame_equal(df, expected)

def test_setitem_frame_duplicate_columns(self):
def test_setitem_frame_duplicate_columns(self, using_array_manager):
# GH#15695
cols = ["A", "B", "C"] * 2
df = DataFrame(index=range(3), columns=cols)
Expand All @@ -382,6 +384,11 @@ def test_setitem_frame_duplicate_columns(self):
columns=cols,
dtype="object",
)
if using_array_manager:
# setitem replaces column so changes dtype
expected["C"] = expected["C"].astype("int64")
# TODO(ArrayManager) .loc still overwrites
expected["B"] = expected["B"].astype("int64")
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]])
Expand Down Expand Up @@ -628,6 +635,8 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected):


class TestDataFrameSetItemWithExpansion:
# TODO(ArrayManager) update parent (_maybe_update_cacher)
@td.skip_array_manager_not_yet_implemented
def test_setitem_listlike_views(self):
# GH#38148
df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})
Expand Down Expand Up @@ -699,7 +708,7 @@ def test_setitem_with_expansion_categorical_dtype(self):

result1 = df["D"]
result2 = df["E"]
tm.assert_categorical_equal(result1._mgr._block.values, cat)
tm.assert_categorical_equal(result1._mgr.array, cat)

# sorting
ser.name = "E"
Expand Down Expand Up @@ -767,6 +776,7 @@ def inc(x):


class TestDataFrameSetItemBooleanMask:
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
@pytest.mark.parametrize(
"mask_type",
[lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values],
Expand Down
36 changes: 30 additions & 6 deletions pandas/tests/frame/indexing/test_xs.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,22 @@ def test_xs_keep_level(self):
result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False)
tm.assert_frame_equal(result, expected)

def test_xs_view(self):
def test_xs_view(self, using_array_manager):
# in 0.14 this will return a view if possible a copy otherwise, but
# this is numpy dependent

dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5))

dm.xs(2)[:] = 10
assert (dm.xs(2) == 10).all()
if using_array_manager:
# INFO(ArrayManager) with ArrayManager getting a row as a view is
# not possible
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
dm.xs(2)[:] = 20
assert not (dm.xs(2) == 20).any()
else:
dm.xs(2)[:] = 20
assert (dm.xs(2) == 20).all()


class TestXSWithMultiIndex:
Expand Down Expand Up @@ -327,10 +335,26 @@ def test_xs_droplevel_false(self):
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)

def test_xs_droplevel_false_view(self):
def test_xs_droplevel_false_view(self, using_array_manager):
# GH#37832
df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
df.values[0, 0] = 2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this version feels much clearer as to whats going on

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a suggestion for how to then do it? (we can't rely on .values to mutate)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess what we really care about here is np.shares_memory(result._values, df.iloc[:, 0])

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the np.shares_memory check. And turned out that also with ArrayManager this actually gives a view. But it's the iloc modifying the parent that doesn't result in modifying the child result, when using ArrayManager, but also when using a mixed DataFrame. So added that as an explicit test case as well.

expected = DataFrame({"a": [2]})
# check that result still views the same data as df
assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)
# modifying original df also modifies result when having a single block
df.iloc[0, 0] = 2
if not using_array_manager:
expected = DataFrame({"a": [2]})
else:
# TODO(ArrayManager) iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)

# with mixed dataframe, modifying the parent doesn't modify result
# TODO the "split" path behaves differently here as with single block
df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
df.iloc[0, 0] = 2
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)