Skip to content

REF: share astype code in MaskedArray #38490

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 4 additions & 17 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@

from pandas.core.dtypes.common import (
is_bool_dtype,
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops
Expand Down Expand Up @@ -372,34 +371,22 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an BooleanDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

if isinstance(dtype, BooleanDtype):
values, mask = coerce_to_array(self, copy=copy)
if not copy:
return self
else:
return BooleanArray(values, mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy)

if is_bool_dtype(dtype):
# astype_nansafe converts np.nan to True
if self._hasna:
raise ValueError("cannot convert float NaN to bool")
else:
return self._data.astype(dtype, copy=copy)
if is_extension_array_dtype(dtype) and is_integer_dtype(dtype):
from pandas.core.arrays import IntegerArray

return IntegerArray(
self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
)
# for integer, error if there are missing values
if is_integer_dtype(dtype) and self._hasna:
raise ValueError("cannot convert NA to integer")

# for float dtype, ensure we use np.nan before casting (numpy cannot
# deal with pd.NA)
na_value = self._na_value
Expand Down
21 changes: 3 additions & 18 deletions pandas/core/arrays/floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,13 @@
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.ops import invalid_comparison
from pandas.core.tools.numeric import to_numeric

from .masked import BaseMaskedDtype
from .numeric import NumericArray, NumericDtype


Expand Down Expand Up @@ -332,24 +331,10 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an FloatingDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.string_ import StringArray, StringDtype

dtype = pandas_dtype(dtype)

# if the dtype is exactly the same, we can fastpath
if self.dtype == dtype:
# return the same object for copy=False
return self.copy() if copy else self
# if we are astyping to another nullable masked dtype, we can fastpath
if isinstance(dtype, BaseMaskedDtype):
# TODO deal with NaNs
data = self._data.astype(dtype.numpy_dtype, copy=copy)
# mask is copied depending on whether the data was copied, and
# not directly depending on the `copy` keyword
mask = self._mask if data is self._data else self._mask.copy()
return dtype.construct_array_type()(data, mask, copy=False)
elif isinstance(dtype, StringDtype):
return StringArray._from_sequence(self, copy=False)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)

# coerce
if is_float_dtype(dtype):
Expand Down
20 changes: 3 additions & 17 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas.compat.numpy import function as nv
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
Expand Down Expand Up @@ -390,24 +390,10 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.masked import BaseMaskedDtype
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

# if the dtype is exactly the same, we can fastpath
if self.dtype == dtype:
# return the same object for copy=False
return self.copy() if copy else self
# if we are astyping to another nullable masked dtype, we can fastpath
if isinstance(dtype, BaseMaskedDtype):
data = self._data.astype(dtype.numpy_dtype, copy=copy)
# mask is copied depending on whether the data was copied, and
# not directly depending on the `copy` keyword
mask = self._mask if data is self._data else self._mask.copy()
return dtype.construct_array_type()(data, mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
if isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)

# coerce
if is_float_dtype(dtype):
Expand Down
28 changes: 27 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas._typing import ArrayLike, Dtype, Scalar
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
is_dtype_equal,
is_integer,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna, notna

Expand Down Expand Up @@ -229,6 +231,30 @@ def to_numpy(
data = self._data.astype(dtype, copy=copy)
return data

def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
dtype = pandas_dtype(dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we know this method is only called from the subclasses, this line is not needed (each of the subclass methods already does it as well). The type annotation also indicates we already have a dtype object.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> The type annotation also indicates we already have a dtype object

Heads up: "Dtype" includes strings; "DtypeObj" means a dtype object.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yes. Can you then update the annotation to DtypeObj? Or actually, to ExtensionDtype? Or will mypy complain about that because the methods in the subclasses have a less strict annotation?

BTW, I would still remove this line


if is_dtype_equal(dtype, self.dtype):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we could in theory actually use `if self.dtype == dtype`, since we know we have an extension dtype, and those should never raise when being compared (this avoids yet another set of checks, though I'm not sure it would ever be significant).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with it either way. This branch was broken off from one that was trying to push some astype boilerplate into a decorator, so it used the is_dtype_equal version.

if copy:
return self.copy()
return self

# if we are astyping to another nullable masked dtype, we can fastpath
if isinstance(dtype, BaseMaskedDtype):
# TODO deal with NaNs for FloatingArray case
data = self._data.astype(dtype.numpy_dtype, copy=copy)
# mask is copied depending on whether the data was copied, and
# not directly depending on the `copy` keyword
mask = self._mask if data is self._data else self._mask.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote this code myself in the past, but I am no longer sure this is needed. AFAIK `self._data.astype` (numpy's method) will always return a copy unless `dtype.numpy_dtype` is exactly the same type as the data. But that case should already be covered by the `if is_dtype_equal(dtype, self.dtype)` check above.

(now since this is copying existing code, fine to leave this for another issue)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, and I think you're right about the numpy behavior.

cls = dtype.construct_array_type()
return cls(data, mask, copy=False)

if isinstance(dtype, ExtensionDtype):
eacls = dtype.construct_array_type()
return eacls._from_sequence(self, dtype=dtype, copy=copy)

raise NotImplementedError("subclass must implement astype to np.dtype")

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us

def __array__(self, dtype=None) -> np.ndarray:
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_dtype_equal,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
Expand Down Expand Up @@ -285,10 +286,12 @@ def __setitem__(self, key, value):

def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if isinstance(dtype, StringDtype):

if is_dtype_equal(dtype, self.dtype):
if copy:
return self.copy()
return self

elif isinstance(dtype, _IntegerDtype):
arr = self._ndarray.copy()
mask = self.isna()
Expand Down