Skip to content

REF: Separate values-casting from Block.astype #38455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 3 commits merged on Dec 13, 2020.
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 59 additions & 61 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike, Scalar, Shape
from pandas._typing import ArrayLike, DtypeObj, Scalar, Shape
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -68,7 +68,7 @@
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.construction import array as pd_array, extract_array
from pandas.core.indexers import (
check_setitem_lengths,
is_empty_indexer,
Expand Down Expand Up @@ -593,7 +593,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
dtype : str, dtype convertible
copy : bool, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'ignore'
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object

Expand All @@ -617,69 +617,23 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
)
raise TypeError(msg)

if dtype is not None:
dtype = pandas_dtype(dtype)

# may need to convert to categorical
if is_categorical_dtype(dtype):

if is_categorical_dtype(self.values.dtype):
# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))

return self.make_block(Categorical(self.values, dtype=dtype))

dtype = pandas_dtype(dtype)

# astype processing
if is_dtype_equal(self.dtype, dtype):
if copy:
return self.copy()
return self

# force the copy here
if self.is_extension:
try:
values = self.values.astype(dtype)
except (ValueError, TypeError):
if errors == "ignore":
values = self.values
else:
raise
else:
if issubclass(dtype.type, str):

# use native type formatting for datetime/tz/timedelta
if self.is_datelike:
values = self.to_native_types().values

# astype formatting
else:
# Because we have neither is_extension nor is_datelike,
# self.values already has the correct shape
values = self.values

try:
new_values = self._astype(dtype, copy=copy)
except (ValueError, TypeError):
# e.g. astype_nansafe can fail on object-dtype of strings
# trying to convert to float
if errors == "ignore":
new_values = self.values
else:
values = self.get_values(dtype=dtype)

# _astype_nansafe works fine with 1-d only
vals1d = values.ravel()
try:
values = astype_nansafe(vals1d, dtype, copy=True)
except (ValueError, TypeError):
# e.g. astype_nansafe can fail on object-dtype of strings
# trying to convert to float
if errors == "raise":
raise
newb = self.copy() if copy else self
return newb

# TODO(EA2D): special case not needed with 2D EAs
if isinstance(values, np.ndarray):
values = values.reshape(self.shape)
raise

newb = self.make_block(values)
if isinstance(new_values, np.ndarray):
# TODO(EA2D): special case not needed with 2D EAs
new_values = new_values.reshape(self.shape)

newb = self.make_block(new_values)
if newb.is_numeric and self.is_numeric:
if newb.shape != self.shape:
raise TypeError(
Expand All @@ -689,6 +643,50 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
)
return newb

def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike:
# Cast ``self.values`` to ``dtype`` and return the resulting array-like.
#
# This helper only converts values; it does no error suppression and does
# not build a new block. The ``astype`` hunk shown earlier in this diff
# calls it inside a try/except and then reshapes/wraps the result.
#
# Parameters
# ----------
# dtype : DtypeObj
#     Target dtype.
# copy : bool
#     Forwarded to the categorical and ExtensionArray ``astype`` calls, and
#     decides whether an equal-dtype request returns ``values.copy()``.
#
# Returns
# -------
# ArrayLike
#
# NOTE(review): indentation was lost in this extract, so the exact nesting
# of the branches below cannot be confirmed from this text alone -- verify
# against the real file before relying on these comments.
values = self.values

# Categorical target: handled entirely here, both paths return early.
if is_categorical_dtype(dtype):

if is_categorical_dtype(values.dtype):
# GH#10696/GH#18593: update an existing categorical efficiently
return values.astype(dtype, copy=copy)

# Values are not categorical yet: construct a Categorical from them.
return Categorical(values, dtype=dtype)

# Same dtype requested: nothing to convert, only honor ``copy``.
if is_dtype_equal(values.dtype, dtype):
if copy:
return values.copy()
return values

# ExtensionArray values delegate to their own ``astype``.
if isinstance(values, ExtensionArray):
values = values.astype(dtype, copy=copy)

else:
# str target: dtype kinds "m"/"M" are formatted via the wrapped array's
# native formatter, with missing values rendered as "NaT".
if issubclass(dtype.type, str):
if values.dtype.kind in ["m", "M"]:
# use native type formatting for datetime/tz/timedelta
arr = pd_array(values)
# Note: in the case where dtype is an np.dtype, i.e. not
# StringDtype, this matches arr.astype(dtype), xref GH#36153
values = arr._format_native_types(na_rep="NaT")

# object target: "m"/"M" values are wrapped with pd_array first and then
# cast to object; everything else is cast to object directly.
elif is_object_dtype(dtype):
if values.dtype.kind in ["m", "M"]:
# Wrap in Timedelta/Timestamp
arr = pd_array(values)
values = arr.astype(object)
else:
values = values.astype(object)
# We still need to go through astype_nansafe for
# e.g. dtype = Sparse[object, 0]

# NOTE(review): presumably this step belongs to the non-ExtensionArray
# branch only -- confirm. It always passes copy=True to astype_nansafe,
# regardless of the ``copy`` argument.
# astype_nansafe works with 1-d only
vals1d = values.ravel()
values = astype_nansafe(vals1d, dtype, copy=True)

return values

def convert(
self,
copy: bool = True,
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_dtype_equal

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype
Expand Down Expand Up @@ -127,11 +129,14 @@ def test_astype_roundtrip(dtype, request):
mark = pytest.mark.xfail(reason=reason)
request.node.add_marker(mark)

s = pd.Series(pd.date_range("2000", periods=12))
s[0] = None
ser = pd.Series(pd.date_range("2000", periods=12))
ser[0] = None

casted = ser.astype(dtype)
assert is_dtype_equal(casted.dtype, dtype)

result = s.astype(dtype).astype("datetime64[ns]")
tm.assert_series_equal(result, s)
result = casted.astype("datetime64[ns]")
tm.assert_series_equal(result, ser)


def test_add(dtype, request):
Expand Down