Skip to content

TYP: core.arrays.numpy_ #31348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 31, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 71 additions & 36 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import numbers
from typing import Union
from typing import Optional, Tuple, Type, TypeVar, Union

import numpy as np
from numpy.lib.mixins import NDArrayOperatorsMixin
Expand Down Expand Up @@ -35,52 +35,68 @@ class PandasDtype(ExtensionDtype):

Parameters
----------
dtype : numpy.dtype
dtype : object
Object to be converted to a NumPy data type object.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see how this is more accurate than just "numpy.dtype", but can we be more specific here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's the types accepted by np.dtype which is more permissive than just another np.dtype. Object is straight out of the numpy docs.

do you want to restrict this as before, or establish the actual types accepted by numpy?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if its straight out of the numpy docs, then sounds fine


See Also
--------
numpy.dtype
"""

_metadata = ("_dtype",)

def __init__(self, dtype):
dtype = np.dtype(dtype)
self._dtype = dtype
self._type = dtype.type
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche does turning this into properties have perf implications we need to worry about

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is for consistency with name that was changed in #31037

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK thanks. That was the perf affect I had in mind, but pre-coffee i had it backwards.

def __init__(self, dtype: object):
self._dtype = np.dtype(dtype)

def __repr__(self) -> str:
return f"PandasDtype({repr(self.name)})"

@property
def numpy_dtype(self):
"""The NumPy dtype this PandasDtype wraps."""
def numpy_dtype(self) -> np.dtype:
"""
The NumPy dtype this PandasDtype wraps.
"""
return self._dtype

@property
def name(self):
def name(self) -> str:
"""
A bit-width name for this data-type.

Un-sized flexible data-type objects do not have this attribute.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can leave this out

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no problem. will remove. can we wrap all numpy dtypes?

"""
return self._dtype.name

@property
def type(self):
return self._type
def type(self) -> Type[np.generic]:
"""
The type object used to instantiate a scalar of this NumPy data-type.
"""
return self._dtype.type

@property
def _is_numeric(self):
def _is_numeric(self) -> bool:
# exclude object, str, unicode, void.
return self.kind in set("biufc")

@property
def _is_boolean(self):
def _is_boolean(self) -> bool:
return self.kind == "b"

@classmethod
def construct_from_string(cls, string):
def construct_from_string(cls, string: str) -> "PandasDtype":
try:
return cls(np.dtype(string))
dtype = np.dtype(string)
except TypeError as err:
raise TypeError(
f"Cannot construct a 'PandasDtype' from '{string}'"
) from err
if not isinstance(string, str):
msg = f"'construct_from_string' expects a string, got {type(string)}"
else:
msg = f"Cannot construct a 'PandasDtype' from '{string}'"
raise TypeError(msg) from err
return cls(dtype)

@classmethod
def construct_array_type(cls):
def construct_array_type(cls) -> Type["PandasArray"]:
"""
Return the array type associated with this dtype.

Expand All @@ -91,15 +107,23 @@ def construct_array_type(cls):
return PandasArray

@property
def kind(self):
def kind(self) -> str:
"""
A character code (one of 'biufcmMOSUV') identifying the general kind of data.
"""
return self._dtype.kind

@property
def itemsize(self):
"""The element size of this data-type object."""
def itemsize(self) -> int:
"""
The element size of this data-type object.
"""
return self._dtype.itemsize


PandasArrayT = TypeVar("PandasArrayT", bound="PandasArray")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L166 is type self, so needed for constructor signature to ensure values is type self.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we use the name as a string in other places?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, I didn't want to imply you necessarily needed to remove it, rather I want to understand it (eg what is the difference with using the class name as a string as we do elsewhere)

To me, this typing is adding a lot of black magic to the codebase, and I want to ensure it stays as understandable as possible for people (including me) that are not so familiar with typing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

an example for the case where the return type is from a base class method

class Base:
    def foo(self) -> "Base":
        return type(self)()


class Derived(Base):
    x = 0

    def bar(self) -> int:
        return self.foo().x


print(Derived().bar())

This is valid python...

$ python pandas/test.py
0

however, mypy will report errors...

$ mypy pandas/test.py
pandas\test.py:10: error: "Base" has no attribute "x"
Found 1 error in 1 file (checked 1 source file)

whereas using a typevar...

from typing import TypeVar

_T = TypeVar("_T")


class Base:
    def foo(self: _T) -> _T:
        return type(self)()


class Derived(Base):
    x = 0

    def bar(self) -> int:
        return self.foo().x


print(Derived().bar())

gives

$ mypy pandas/test.py
Success: no issues found in 1 source file

so typing the base class needs to use typevars to keep mypy green when we add types to the subclasses. I have reverted, but will probably need to add back when the subclasses get merged.

I've not included an example for the init case, but similar principal

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that explanation helps (but if we start adding it, it needs to be explained in our contributing guidelines)

But: PandasArray is not a base class, it is not further subclassed I think. So why is it needed here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, actually, StringArray now subclasses PandasArray, so forget the last sentence of what I just said ;)



class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin):
"""
A pandas ExtensionArray for NumPy data.
Expand Down Expand Up @@ -136,7 +160,9 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin):
# ------------------------------------------------------------------------
# Constructors

def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False):
def __init__(
self: PandasArrayT, values: Union[np.ndarray, PandasArrayT], copy: bool = False
):
if isinstance(values, type(self)):
values = values._ndarray
if not isinstance(values, np.ndarray):
Expand All @@ -154,7 +180,9 @@ def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False)
self._dtype = PandasDtype(values.dtype)

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
def _from_sequence(
cls: Type[PandasArrayT], scalars, dtype=None, copy: bool = False
) -> PandasArrayT:
if isinstance(dtype, PandasDtype):
dtype = dtype._dtype

Expand All @@ -164,18 +192,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
return cls(result)

@classmethod
def _from_factorized(cls, values, original):
def _from_factorized(cls: Type[PandasArrayT], values, original) -> PandasArrayT:
return cls(values)

@classmethod
def _concat_same_type(cls, to_concat):
def _concat_same_type(cls: Type[PandasArrayT], to_concat) -> PandasArrayT:
return cls(np.concatenate(to_concat))

# ------------------------------------------------------------------------
# Data

@property
def dtype(self):
def dtype(self) -> PandasDtype:
return self._dtype

# ------------------------------------------------------------------------
Expand All @@ -186,7 +214,7 @@ def __array__(self, dtype=None) -> np.ndarray:

_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
# Lightly modified version of
# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\
# numpy.lib.mixins.NDArrayOperatorsMixin.html
Expand Down Expand Up @@ -242,7 +270,7 @@ def __getitem__(self, item):
result = type(self)(result)
return result

def __setitem__(self, key, value):
def __setitem__(self, key, value) -> None:
value = extract_array(value, extract_numpy=True)

scalar_key = lib.is_scalar(key)
Expand All @@ -263,10 +291,15 @@ def __len__(self) -> int:
def nbytes(self) -> int:
return self._ndarray.nbytes

def isna(self):
def isna(self) -> np.ndarray:
return isna(self._ndarray)

def fillna(self, value=None, method=None, limit=None):
def fillna(
self: PandasArrayT,
value=None,
method: Optional[str] = None,
limit: Optional[int] = None,
) -> PandasArrayT:
# TODO(_values_for_fillna): remove this
value, method = validate_fillna_kwargs(value, method)

Expand All @@ -293,7 +326,9 @@ def fillna(self, value=None, method=None, limit=None):
new_values = self.copy()
return new_values

def take(self, indices, allow_fill=False, fill_value=None):
def take(
self: PandasArrayT, indices, allow_fill=False, fill_value=None
) -> PandasArrayT:
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
Expand All @@ -302,16 +337,16 @@ def take(self, indices, allow_fill=False, fill_value=None):
)
return type(self)(result)

def copy(self):
def copy(self: PandasArrayT) -> PandasArrayT:
return type(self)(self._ndarray.copy())

def _values_for_argsort(self):
def _values_for_argsort(self) -> np.ndarray:
return self._ndarray

def _values_for_factorize(self):
def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
return self._ndarray, -1

def unique(self):
def unique(self: PandasArrayT) -> PandasArrayT:
return type(self)(unique(self._ndarray))

# ------------------------------------------------------------------------
Expand Down