Skip to content

TYP: partial typing of masked array #31728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 15 additions & 19 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import numbers
from typing import TYPE_CHECKING, Any, List, Tuple, Type, Union
from typing import TYPE_CHECKING, List, Tuple, Type, Union
import warnings

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import ArrayLike
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv

Expand Down Expand Up @@ -281,20 +282,15 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not mask.ndim == 1:
raise ValueError("mask must be a 1D array")

if copy:
values = values.copy()
mask = mask.copy()

self._data = values
self._mask = mask
self._dtype = BooleanDtype()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could be a class attribute. no need to be assigned in constructor?

super().__init__(values, mask, copy=copy)

@property
def dtype(self):
def dtype(self) -> BooleanDtype:
return self._dtype

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "BooleanArray":
if dtype:
assert dtype == "boolean"
values, mask = coerce_to_array(scalars, copy=copy)
Expand All @@ -303,7 +299,7 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
@classmethod
def _from_sequence_of_strings(
cls, strings: List[str], dtype=None, copy: bool = False
):
) -> "BooleanArray":
def map_string(s):
if isna(s):
return s
Expand All @@ -317,18 +313,18 @@ def map_string(s):
scalars = [map_string(x) for x in strings]
return cls._from_sequence(scalars, dtype, copy)

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, though, the "Any" is correct (I don't know what the typing should do here, but the signature of the base class would use "Any", so any place where _values_for_factorize is called would need to assume "Any")

data = self._data.astype("int8")
data[self._mask] = -1
return data, -1

@classmethod
def _from_factorized(cls, values, original: "BooleanArray"):
def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
return cls._from_sequence(values, dtype=original.dtype)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
# For BooleanArray inputs, we apply the ufunc to ._data
# and mask the result.
if method == "reduce":
Expand Down Expand Up @@ -373,7 +369,7 @@ def reconstruct(x):
else:
return reconstruct(result)

def __setitem__(self, key, value):
def __setitem__(self, key, value) -> None:
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
Expand All @@ -387,7 +383,7 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def astype(self, dtype, copy=True):
def astype(self, dtype, copy: bool = True) -> ArrayLike:
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.

Expand All @@ -402,8 +398,8 @@ def astype(self, dtype, copy=True):

Returns
-------
array : ndarray or ExtensionArray
NumPy ndarray, BooleanArray or IntergerArray with 'dtype' for its dtype.
ndarray or ExtensionArray
NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

Raises
------
Expand Down Expand Up @@ -693,7 +689,7 @@ def cmp_method(self, other):
name = f"__{op.__name__}"
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):
def _reduce(self, name: str, skipna: bool = True, **kwargs):

if name in {"any", "all"}:
return getattr(self, name)(skipna=skipna, **kwargs)
Expand Down Expand Up @@ -722,7 +718,7 @@ def _reduce(self, name, skipna=True, **kwargs):

return result

def _maybe_mask_result(self, result, mask, other, op_name):
def _maybe_mask_result(self, result, mask, other, op_name: str):
"""
Parameters
----------
Expand Down
29 changes: 12 additions & 17 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import numbers
from typing import TYPE_CHECKING, Any, Dict, Tuple, Type, Union
from typing import TYPE_CHECKING, Tuple, Type, Union
import warnings

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import ArrayLike
from pandas.compat import set_function_name
from pandas.util._decorators import cache_readonly

Expand Down Expand Up @@ -347,13 +348,7 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
"mask should be boolean numpy array. Use "
"the 'integer_array' function instead"
)

if copy:
values = values.copy()
mask = mask.copy()

self._data = values
self._mask = mask
super().__init__(values, mask, copy=copy)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which superclass performs this logic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BaseMaskedArray?


@classmethod
def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray":
Expand Down Expand Up @@ -417,7 +412,7 @@ def reconstruct(x):
else:
return reconstruct(result)

def __setitem__(self, key, value):
def __setitem__(self, key, value) -> None:
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
Expand All @@ -431,9 +426,9 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def astype(self, dtype, copy=True):
def astype(self, dtype, copy: bool = True) -> ArrayLike:
"""
Cast to a NumPy array or IntegerArray with 'dtype'.
Cast to a NumPy array or ExtensionArray with 'dtype'.

Parameters
----------
Expand All @@ -446,8 +441,8 @@ def astype(self, dtype, copy=True):

Returns
-------
array : ndarray or IntegerArray
NumPy ndarray or IntergerArray with 'dtype' for its dtype.
ndarray or ExtensionArray
NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

Raises
------
Expand Down Expand Up @@ -488,7 +483,7 @@ def _ndarray_values(self) -> np.ndarray:
"""
return self._data

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
# TODO: https://github.com/pandas-dev/pandas/issues/30037
# use masked algorithms, rather than object-dtype / np.nan.
return self.to_numpy(na_value=np.nan), np.nan
Expand Down Expand Up @@ -565,7 +560,7 @@ def cmp_method(self, other):
name = f"__{op.__name__}__"
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):
def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

Expand All @@ -592,7 +587,7 @@ def _reduce(self, name, skipna=True, **kwargs):

return result

def _maybe_mask_result(self, result, mask, other, op_name):
def _maybe_mask_result(self, result, mask, other, op_name: str):
"""
Parameters
----------
Expand Down Expand Up @@ -768,7 +763,7 @@ class UInt64Dtype(_IntegerDtype):
__doc__ = _dtype_docstring.format(dtype="uint64")


_dtypes: Dict[str, _IntegerDtype] = {
_dtypes = {
"int8": Int8Dtype(),
"int16": Int16Dtype(),
"int32": Int32Dtype(),
Expand Down
44 changes: 29 additions & 15 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional, Type, TypeVar

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar

from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
from pandas.core.dtypes.missing import isna, notna
Expand All @@ -12,7 +13,10 @@
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
from pandas._typing import Scalar
from pandas import Series


BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")


class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
Expand All @@ -22,11 +26,16 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
numpy based
"""

_data: np.ndarray
_mask: np.ndarray

# The value used to fill '_data' to avoid upcasting
_internal_fill_value: "Scalar"
_internal_fill_value: Scalar

def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__init__ needs to be declared in the base class...

pandas\core\arrays\masked.py:56: error: Too many arguments for "BaseMaskedArray"
pandas\core\arrays\masked.py:181: error: Too many arguments for "BaseMaskedArray"
pandas\core\arrays\masked.py:207: error: Too many arguments for "BaseMaskedArray"
pandas\core\arrays\masked.py:207: error: Unexpected keyword argument "copy" for "BaseMaskedArray"
pandas\core\arrays\masked.py:213: error: Too many arguments for "BaseMaskedArray"
pandas\core\arrays\masked.py:213: error: Unexpected keyword argument "copy" for "BaseMaskedArray"

also creating this ensures that the subclasses have the correct signature for the constructor to work with __invert__, _concat_same_type, take and copy from the base class.

we could just use a AbstractMethodError but I think it makes sense to put the shared functionality here.

BooleanArray has checking for values.ndim and mask.ndim. IntegerArray does not. It may make sense to have that check here also if applicable to IntegerArray.

if copy:
values = values.copy()
mask = mask.copy()

self._data = values
self._mask = mask

def __getitem__(self, item):
if is_integer(item):
Expand All @@ -48,12 +57,12 @@ def __iter__(self):
def __len__(self) -> int:
return len(self._data)

def __invert__(self):
def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
return type(self)(~self._data, self._mask)

def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
):
self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default,
) -> np.ndarray:
"""
Convert to a NumPy Array.

Expand Down Expand Up @@ -159,24 +168,29 @@ def _hasna(self) -> bool:
# source code using it..
return self._mask.any()

def isna(self):
def isna(self) -> np.ndarray:
return self._mask

@property
def _na_value(self):
return self.dtype.na_value

@property
def nbytes(self):
def nbytes(self) -> int:
return self._data.nbytes + self._mask.nbytes

@classmethod
def _concat_same_type(cls, to_concat):
def _concat_same_type(cls: Type[BaseMaskedArrayT], to_concat) -> BaseMaskedArrayT:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to add the typevar to avoid...

pandas\core\arrays\integer.py:117: error: Incompatible return value type (got "BaseMaskedArray", expected "IntegerArray")
pandas\core\arrays\boolean.py:122: error: Incompatible return value type (got "BaseMaskedArray", expected "BooleanArray")

we can't use the unbound typevar from pandas._typing here otherwise we get...

pandas\core\arrays\masked.py:183: error: Too many arguments for "object"

since the typevar is needed here, it is also used for the other methods that return type(self)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I said in another issue, I don't have problems with it if there is no way around it, but it needs to be documented then

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is a problem; it's documented in PEP 484

https://www.python.org/dev/peps/pep-0484/#annotating-instance-and-class-methods

data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask)

def take(self, indexer, allow_fill=False, fill_value=None):
def take(
self: BaseMaskedArrayT,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We otherwise don't type self, or do we?
(self is always the type of the class, no?)

indexer,
allow_fill: bool = False,
fill_value: Optional[Scalar] = None,
) -> BaseMaskedArrayT:
# we always fill with 1 internally
# to avoid upcasting
data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
Expand All @@ -197,13 +211,13 @@ def take(self, indexer, allow_fill=False, fill_value=None):

return type(self)(result, mask, copy=False)

def copy(self):
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
data, mask = self._data, self._mask
data = data.copy()
mask = mask.copy()
return type(self)(data, mask, copy=False)

def value_counts(self, dropna=True):
def value_counts(self, dropna: bool = True) -> "Series":
"""
Returns a Series containing counts of each unique value.

Expand Down