Skip to content

TYP: require Index objects earlier in internals #33100

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 4, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions pandas/core/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
BlockManager,
SingleBlockManager,
concatenate_block_managers,
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)

__all__ = [
Expand All @@ -40,6 +38,4 @@
"BlockManager",
"SingleBlockManager",
"concatenate_block_managers",
"create_block_manager_from_arrays",
"create_block_manager_from_blocks",
]
23 changes: 13 additions & 10 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
ABCDataFrame,
ABCDatetimeIndex,
ABCIndexClass,
ABCPeriodIndex,
ABCSeries,
ABCTimedeltaIndex,
)
Expand All @@ -44,7 +43,7 @@
get_objs_combined_axis,
union_indexes,
)
from pandas.core.internals import (
from pandas.core.internals.managers import (
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)
Expand All @@ -53,12 +52,16 @@
# BlockManager Interface


def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrity=True):
def arrays_to_mgr(
arrays, arr_names, index, columns, dtype=None, verify_integrity: bool = True
):
"""
Segregate Series based on type and coerce into matrices.

Needs to handle a lot of exceptional cases.
"""
arr_names = ensure_index(arr_names)

if verify_integrity:
# figure out the index, if necessary
if index is None:
Expand All @@ -74,6 +77,8 @@ def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrit
# from BlockManager perspective
axes = [columns, index]

assert isinstance(columns, Index), type(columns)
assert isinstance(index, Index), type(index)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ensure_index on L70 and L75 should return an Index. can you type ensure_index instead of adding asserts

return create_block_manager_from_arrays(arrays, arr_names, axes)


Expand Down Expand Up @@ -163,7 +168,8 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
values = [values]

if columns is None:
columns = list(range(len(values)))
columns = Index(range(len(values)))

return arrays_to_mgr(values, columns, index, columns, dtype=dtype)

# by definition an array here
Expand Down Expand Up @@ -209,6 +215,8 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
else:
block_values = [values]

assert isinstance(columns, Index), type(columns)
assert isinstance(index, Index), type(index)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar comment as above, can we type _get_axes instead of adding asserts.

return create_block_manager_from_blocks(block_values, [columns, index])


Expand Down Expand Up @@ -635,12 +643,7 @@ def sanitize_index(data, index: Index):
if len(data) != len(index):
raise ValueError("Length of values does not match length of index")

if isinstance(data, ABCIndexClass):
pass
elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):
data = data._values

elif isinstance(data, np.ndarray):
if isinstance(data, np.ndarray):

# coerce datetimelike types
if data.dtype.kind in ["M", "m"]:
Expand Down
31 changes: 19 additions & 12 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def _verify_integrity(self) -> None:
tot_items = sum(len(x.mgr_locs) for x in self.blocks)
for block in self.blocks:
if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
construction_error(tot_items, block.shape[1:], self.axes)
raise construction_error(tot_items, block.shape[1:], self.axes)
if len(self.items) != tot_items:
raise AssertionError(
"Number of manager items must equal union of "
Expand Down Expand Up @@ -1657,7 +1657,7 @@ def concat(self, to_concat, new_axis: Index) -> "SingleBlockManager":
# Constructor Helpers


def create_block_manager_from_blocks(blocks, axes):
def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
try:
if len(blocks) == 1 and not isinstance(blocks[0], Block):
# if blocks[0] is of length 0, return empty blocks
Expand All @@ -1678,18 +1678,23 @@ def create_block_manager_from_blocks(blocks, axes):
except ValueError as e:
blocks = [getattr(b, "values", b) for b in blocks]
tot_items = sum(b.shape[0] for b in blocks)
construction_error(tot_items, blocks[0].shape[1:], axes, e)
raise construction_error(tot_items, blocks[0].shape[1:], axes, e)


def create_block_manager_from_arrays(arrays, names, axes):
def create_block_manager_from_arrays(
arrays, names: Index, axes: List[Index]
) -> BlockManager:
assert isinstance(names, Index)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory mypy will flag violations here already for us, so superfluous to have these here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonjayhawkins is this correct? in a previous pass I found there were annotation-defying types being passed in this function that mypy had not caught

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, @WillAyd is correct, but in practice we probably need the run-time asserts to be sure. probably better to fail than potentially produce incorrect results.

in a previous pass I found there were annotation-defying types being passed in this function that mypy had not caught

if a variable is Any or the callsite is not being checked, then we can pass the wrong type.

assert isinstance(axes, list)
assert all(isinstance(x, Index) for x in axes)

try:
blocks = form_blocks(arrays, names, axes)
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
construction_error(len(arrays), arrays[0].shape, axes, e)
raise construction_error(len(arrays), arrays[0].shape, axes, e)


def construction_error(tot_items, block_shape, axes, e=None):
Expand All @@ -1704,23 +1709,25 @@ def construction_error(tot_items, block_shape, axes, e=None):
if len(implied) <= 2:
implied = implied[::-1]

# We return the exception object instead of raising it so that we
# can raise it in the caller; mypy plays better with that
if passed == implied and e is not None:
raise e
return e
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than do this I think would be better to just mark this function with typing.NoReturn as the return value

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i recall that fails on our min version build.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

elsewhere (in particular in indexes.base) we've moved towards this pattern

if block_shape[0] == 0:
raise ValueError("Empty data passed with indices specified.")
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
return ValueError("Empty data passed with indices specified.")
return ValueError(f"Shape of passed values is {passed}, indices imply {implied}")


# -----------------------------------------------------------------------


def form_blocks(arrays, names, axes):
def form_blocks(arrays, names: Index, axes) -> List[Block]:
# put "leftover" items in float bucket, where else?
# generalize?
items_dict = defaultdict(list)
items_dict: Dict[str, List] = defaultdict(list)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this throwing an error? I don't think this is required but if so should type using typing.DefaultDict

extra_locs = []

names_idx = ensure_index(names)
names_idx = names
if names_idx.equals(axes[0]):
names_indexer = np.arange(len(names_idx))
else:
Expand All @@ -1738,7 +1745,7 @@ def form_blocks(arrays, names, axes):
block_type = get_block_type(v)
items_dict[block_type.__name__].append((i, k, v))

blocks = []
blocks: List[Block] = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just as FYI but newer versions of mypy technically have better inferencing capabilities for empty sequence assignment like this. @simonjayhawkins

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

id be happy to remove this; is that the suggested course of action?

if len(items_dict["FloatBlock"]):
float_blocks = _multi_blockify(items_dict["FloatBlock"])
blocks.extend(float_blocks)
Expand Down