Skip to content

implement astype portion of #24024 #24405

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Dec 28, 2018
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6a5c216
implement astype portion of #24024
jbrockmendel Dec 24, 2018
1a9f30b
fixup unused import
jbrockmendel Dec 24, 2018
1b109b8
isort fixup
jbrockmendel Dec 24, 2018
f271005
Merge branch 'master' of https://github.com/pandas-dev/pandas into le…
jbrockmendel Dec 24, 2018
5615b9f
pass copy kwarg
jbrockmendel Dec 24, 2018
d5cca5a
Merge branch 'master' of https://github.com/pandas-dev/pandas into le…
jbrockmendel Dec 25, 2018
184f59f
revert change that broke the world
jbrockmendel Dec 25, 2018
df39bd7
Merge branch 'master' of https://github.com/pandas-dev/pandas into le…
jbrockmendel Dec 25, 2018
e41068a
comments, typo
jbrockmendel Dec 25, 2018
6f108dd
avoid double-copy
jbrockmendel Dec 25, 2018
b123d08
Merge branch 'master' of https://github.com/pandas-dev/pandas into le…
jbrockmendel Dec 28, 2018
207ffb9
Merge branch 'master' of https://github.com/pandas-dev/pandas into le…
jbrockmendel Dec 28, 2018
04efd45
sidestep int sign/size astype issues
jbrockmendel Dec 28, 2018
3fca810
Implement UInt64 handling, tests, and docs
TomAugspurger Dec 28, 2018
5fa32e9
Handle uint in astype tests
TomAugspurger Dec 28, 2018
5d718e6
Fixed TimedeltaArray._format_native_types
TomAugspurger Dec 28, 2018
33b5434
Linting
TomAugspurger Dec 28, 2018
e29d898
Change default to str
TomAugspurger Dec 28, 2018
a3c42f0
revert for period
TomAugspurger Dec 28, 2018
eac662b
Merge remote-tracking branch 'upstream/master' into jbrockmendel-less…
TomAugspurger Dec 28, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg

from pandas.core.dtypes.common import (
is_bool_dtype, is_datetime64_any_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype,
is_integer_dtype, is_list_like, is_object_dtype, is_offsetlike,
is_period_dtype, is_timedelta64_dtype, needs_i8_conversion)
is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype,
is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
is_list_like, is_object_dtype, is_offsetlike, is_period_dtype,
is_string_dtype, is_timedelta64_dtype, needs_i8_conversion, pandas_dtype)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -403,9 +404,49 @@ def __getitem__(self, key):
return self._simple_new(result, **attribs)

def astype(self, dtype, copy=True):
# Cast this datetime-like array to ``dtype``. After ``dtype`` is normalised
# with pandas_dtype, the branches below dispatch on the target dtype:
#   object                   -> self._box_values(self.asi8)
#   string (not categorical) -> self._format_native_types()
#   integer                  -> self.asi8, copied first when ``copy`` is truthy
#   datetime/timedelta dtype not equal to self.dtype, or float -> TypeError
#   categorical              -> Categorical(self, dtype=dtype)
#   anything else            -> np.asarray(self, dtype=dtype)
# Some notes on cases we don't have to handle here in the base class:
# 1. PeriodArray.astype handles period -> period
# 2. DatetimeArray.astype handles conversion between tz.
# 3. DatetimeArray.astype handles datetime -> period
# NOTE(review): function-local import; presumably this avoids a circular
# import at module load time -- confirm.
from pandas import Categorical
dtype = pandas_dtype(dtype)

if is_object_dtype(dtype):
return self._box_values(self.asi8)
# NOTE(review): the ``return super(...)`` line below sits between a ``return``
# and an ``elif``. It looks like the pre-change fallback carried over from
# the diff view rather than live code -- confirm against the raw diff.
return super(DatetimeLikeArrayMixin, self).astype(dtype, copy)
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
return self._format_native_types()
elif is_integer_dtype(dtype):
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
values = self.asi8
if copy:
values = values.copy()
return values
elif (is_datetime_or_timedelta_dtype(dtype) and
not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = 'Cannot cast {name} to dtype {dtype}'
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
elif is_categorical_dtype(dtype):
return Categorical(self, dtype=dtype)
else:
# Last resort: let numpy perform the conversion.
return np.asarray(self, dtype=dtype)

def view(self, dtype=None):
    """
    Return a view of the underlying data.

    Parameters
    ----------
    dtype : numpy dtype, optional
        Forwarded unchanged to the ``view`` method of ``self._data``.

    Returns
    -------
    ndarray
        ``self._data`` viewed with the given `dtype`.
    """
    backing = self._data
    return backing.view(dtype=dtype)

# ------------------------------------------------------------------
# ExtensionArray Interface
Expand Down
34 changes: 32 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@

from pandas.core.dtypes.common import (
_INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_extension_type, is_float_dtype, is_int64_dtype,
is_object_dtype, is_period_dtype, is_string_dtype, is_timedelta64_dtype)
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal,
is_extension_type, is_float_dtype, is_int64_dtype, is_object_dtype,
is_period_dtype, is_string_dtype, is_timedelta64_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -469,6 +470,35 @@ def __iter__(self):
for v in converted:
yield v

def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``, handling the datetime- and period-specific targets.

    Parameters
    ----------
    dtype : str or dtype
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        Consulted when ``dtype`` equals the current tz-aware dtype, and
        forwarded to ``DatetimeLikeArrayMixin.astype`` otherwise.

    Returns
    -------
    The converted values; the concrete type depends on the branch taken.
    """
    dtype = pandas_dtype(dtype)

    if (is_datetime64_ns_dtype(dtype) and
            not is_dtype_equal(dtype, self.dtype)):
        # GH#18951: datetime64_ns dtype but not equal means different tz
        target_tz = getattr(dtype, 'tz', None)
        current_tz = getattr(self.dtype, 'tz', None)
        if current_tz is None:
            # no tz on our own dtype: attach the requested one
            return self.tz_localize(target_tz)
        converted = self.tz_convert(target_tz)
        # Do we want .astype('datetime64[ns]') to be an ndarray?
        # The astype in Block._astype expects this to return an
        # ndarray, but we could maybe work around it there.
        return converted._data if target_tz is None else converted

    if (is_datetime64tz_dtype(self.dtype) and
            is_dtype_equal(self.dtype, dtype)):
        # same tz-aware dtype: hand back a copy or self, per ``copy``
        return self.copy() if copy else self

    if is_period_dtype(dtype):
        return self.to_period(freq=dtype.freq)

    # everything else is handled by the shared datetime-like implementation
    return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noticed... it'd be nice to leave a bunch of TODO: Use super for places like this.

Actually... I think Python2 will force us to make these changes when we switch inheritance to composition, since we won't be able to call the unbound method with a DatetimeIndex anymore (I think).


# ----------------------------------------------------------------
# ExtensionArray Interface

Expand Down
43 changes: 6 additions & 37 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
_TD_DTYPE, ensure_object, is_array_like, is_categorical_dtype,
is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal,
is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype,
is_period_dtype, is_string_dtype, pandas_dtype)
_TD_DTYPE, ensure_object, is_array_like, is_datetime64_dtype,
is_float_dtype, is_list_like, is_period_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries
from pandas.core.dtypes.missing import isna, notna
Expand Down Expand Up @@ -593,42 +591,13 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):
# ------------------------------------------------------------------

def astype(self, dtype, copy=True):
# NOTE(review): this hunk is listed as "6 additions & 37 deletions" and is
# rendered without +/- markers, so the old per-dtype chain and its short
# replacement appear interleaved below. Read against the raw diff before
# treating any single line as live code.
# TODO: Figure out something better here...
# We have DatetimeLikeArrayMixin ->
# super(...), which ends up being... DatetimeIndexOpsMixin?
# this is complicated.
# need a pandas_astype(arr, dtype).
from pandas import Categorical

# We handle Period[T] -> Period[U]
# Our parent handles everything else.
dtype = pandas_dtype(dtype)

if is_object_dtype(dtype):
return np.asarray(self, dtype=object)
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
return self._format_native_types()
elif is_integer_dtype(dtype):
values = self._data

if values.dtype != dtype:
# int32 vs. int64
values = values.astype(dtype)

elif copy:
values = values.copy()

return values
elif (is_datetime_or_timedelta_dtype(dtype) and
not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = 'Cannot cast {name} to dtype {dtype}'
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
elif is_categorical_dtype(dtype):
return Categorical(self, dtype=dtype)
# NOTE(review): the two consecutive period-dtype checks below look like the
# removed ``elif`` and its added ``if`` replacement -- confirm.
elif is_period_dtype(dtype):
if is_period_dtype(dtype):
# Period -> Period is a frequency conversion to the target dtype's freq.
return self.asfreq(dtype.freq)
else:
return np.asarray(self, dtype=dtype)
# Every other target dtype is delegated to the parent class' astype.
return super(PeriodArray, self).astype(dtype, copy=copy)

@property
def flags(self):
Expand Down
29 changes: 28 additions & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from pandas.core.dtypes.common import (
_NS_DTYPE, _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_float_dtype,
is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
is_string_dtype, is_timedelta64_dtype)
is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype,
pandas_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex)
Expand Down Expand Up @@ -231,6 +232,32 @@ def _validate_fill_value(self, fill_value):
"Got '{got}'.".format(got=fill_value))
return fill_value

def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``, special-casing timedelta64 targets.

    Parameters
    ----------
    dtype : str or dtype
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        Whether a copy is requested. It is not consulted on the non-nano
        path when missing values are present (see below).

    Returns
    -------
    ``self`` or a copy for timedelta64[ns]; for other timedelta64 units
    the converted ``_data`` (masked via ``_maybe_mask_results`` when
    ``self._hasnans``, otherwise re-cast to 'i8'); anything else is
    whatever ``DatetimeLikeArrayMixin.astype`` returns.
    """
    dtype = pandas_dtype(dtype)

    if is_timedelta64_ns_dtype(dtype):
        # --> timedelta64[ns]: hand back a copy or self, per ``copy``
        return self.copy() if copy else self

    if is_timedelta64_dtype(dtype):
        # --> timedelta64 with a non-nano unit.
        # by pandas convention, converting to non-nano timedelta64
        # returns an int64-dtyped array with ints representing multiples
        # of the desired timedelta unit. This is essentially division
        if self._hasnans:
            # avoid double-copying
            converted = self._data.astype(dtype, copy=False)
            return self._maybe_mask_results(converted,
                                            fill_value=None,
                                            convert='float64')
        converted = self._data.astype(dtype, copy=copy)
        return converted.astype('i8')

    # DatetimeLikeArrayMixin handles the remaining cases
    return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)

# ----------------------------------------------------------------
# Rendering Methods

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
is_period_dtype, is_scalar, is_string_dtype, is_string_like_dtype,
is_timedelta64_dtype, needs_i8_conversion, pandas_dtype)
from .generic import (
ABCExtensionArray, ABCGeneric, ABCIndexClass, ABCMultiIndex, ABCSeries)
ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass,
ABCMultiIndex, ABCSeries, ABCTimedeltaArray)
from .inference import is_list_like

isposinf_scalar = libmissing.isposinf_scalar
Expand Down Expand Up @@ -108,7 +109,8 @@ def _isna_new(obj):
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
ABCExtensionArray)):
ABCExtensionArray,
ABCDatetimeArray, ABCTimedeltaArray)):
return _isna_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isna(func=isna))
Expand Down
42 changes: 21 additions & 21 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
from pandas.util._decorators import Appender, cache_readonly

from pandas.core.dtypes.common import (
ensure_int64, is_bool_dtype, is_categorical_dtype,
is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype,
is_integer, is_integer_dtype, is_list_like, is_object_dtype,
is_period_dtype, is_scalar, is_string_dtype)
ensure_int64, is_bool_dtype, is_dtype_equal, is_float, is_integer,
is_integer_dtype, is_list_like, is_period_dtype, is_scalar, pandas_dtype)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries

from pandas.core import algorithms, ops
Expand All @@ -40,6 +38,7 @@ class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin):
# override DatetimeLikeArrayMixin method
copy = Index.copy
unique = Index.unique
view = Index.view

# DatetimeLikeArrayMixin assumes subclasses are mutable, so these are
# properties there. They can be made into cache_readonly for Index
Expand Down Expand Up @@ -527,24 +526,25 @@ def _maybe_box_as_values(self, values, **attribs):
# - sort_values
return values

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
if is_object_dtype(dtype):
return self._box_values_as_index()
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
return Index(self.format(), name=self.name, dtype=object)
elif is_integer_dtype(dtype):
# TODO(DatetimeArray): use self._values here.
# Can't use ._values currently, because that returns a
# DatetimeIndex, which throws us in an infinite loop.
return Index(self.values.astype('i8', copy=copy), name=self.name,
dtype='i8')
elif (is_datetime_or_timedelta_dtype(dtype) and
not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = 'Cannot cast {name} to dtype {dtype}'
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
if is_dtype_equal(self.dtype, dtype) and copy is False:
# Ensure that self.astype(self.dtype) is self
return self

new_values = self._eadata.astype(dtype, copy=copy)

# we pass `dtype` to the Index constructor, for cases like
# dtype=object to disable inference. But, DTA.astype ignores
# integer sign and size, so we need to detect that case and
# just choose int64.
dtype = pandas_dtype(dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure this is necessary as it already coerces properly, doing it here is very weird.

In [2]: pd.Index([1,2,3],dtype='int32')
Out[2]: Int64Index([1, 2, 3], dtype='int64')

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you address this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not in the last 8 hours, no. May need to wait on Tom to clarify, since all of this was taken from 24024.

(the fact that these things get closer attention in smaller doses reassures me that splitting is a good idea, even if it does cause rebasing hassles in the parent PR)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, sounds ok then.

Copy link
Contributor

@TomAugspurger TomAugspurger Dec 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What’s the question here? Why we do the integer check? Astype ignores the sign and size. I suppose the index constructor just ignores the size?

Copy link
Contributor

@TomAugspurger TomAugspurger Dec 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if you saw the part about uint vs. int.

So I'm just going to decide that the expected behavior for {Datetime,Timedelta,Period}Index.astype("uint{8,16,32,64}") is to return a UInt64Index. That means we can remove this check and just pass new_values through with the original dtype.

@jbrockmendel do you want to do that here? It's not at all tested, and will need a release note.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok by me (of course it's weird to do this, but hey)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> do you want to do that here? It's not at all tested, and will need a release note.

I tried this, pretty much just deleting ten lines here, and ended up getting two failures in pandas/tests/indexes/interval/test_astype.py. I can fix this by changing dtype=dtype to dtype=new_values.dtype in the call that wraps self._eadata.astype. Is that what you have in mind?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Attempt #2 at this also failed. Any other ideas?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I missed this note last night. I implemented this in 3fca810 if you could take a look.

if is_integer_dtype(dtype):
dtype = np.dtype("int64")

# pass copy=False because any copying will be done in the
# _eadata.astype call above
return Index(new_values, dtype=dtype, name=self.name, copy=False)

@Appender(DatetimeLikeArrayMixin._time_shift.__doc__)
def _time_shift(self, periods, freq=None):
Expand Down
27 changes: 9 additions & 18 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,16 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.common import (
_NS_DTYPE, ensure_int64, is_datetime64_ns_dtype, is_dtype_equal, is_float,
is_integer, is_list_like, is_period_dtype, is_scalar, is_string_like,
pandas_dtype)
_NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar,
is_string_like)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.missing import isna

from pandas.core.arrays.datetimes import (
DatetimeArrayMixin as DatetimeArray, _to_m8)
from pandas.core.base import _shared_docs
import pandas.core.common as com
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.indexes.base import Index
from pandas.core.indexes.datetimelike import (
DatetimeIndexOpsMixin, wrap_array_method, wrap_field_accessor)
from pandas.core.indexes.numeric import Int64Index
Expand Down Expand Up @@ -603,20 +602,6 @@ def intersection(self, other):

# --------------------------------------------------------------------

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
    # The docstring is supplied by the Appender decorator above, so only
    # ``#`` comments are used in this body.
    dtype = pandas_dtype(dtype)

    tz_differs = (is_datetime64_ns_dtype(dtype) and
                  not is_dtype_equal(dtype, self.dtype))
    if tz_differs:
        # GH 18951: datetime64_ns dtype but not equal means different tz
        target_tz = getattr(dtype, 'tz', None)
        if getattr(self.dtype, 'tz', None) is None:
            # no tz on our own dtype: localize to the requested one
            return self.tz_localize(target_tz)
        return self.tz_convert(target_tz)

    if is_period_dtype(dtype):
        return self.to_period(freq=dtype.freq)

    # all remaining dtypes go through the parent implementation
    return super(DatetimeIndex, self).astype(dtype, copy=copy)

def _get_time_micros(self):
values = self.asi8
if self.tz is not None and not timezones.is_utc(self.tz):
Expand Down Expand Up @@ -1089,10 +1074,16 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
# --------------------------------------------------------------------
# Wrapping DatetimeArray

@property
def _eadata(self):
    """
    DatetimeArray built from ``self._data`` with this index's tz and freq.
    """
    values = self._data
    return DatetimeArray._simple_new(values, tz=self.tz, freq=self.freq)

# Compat for frequency inference, see GH#23789
_is_monotonic_increasing = Index.is_monotonic_increasing
_is_monotonic_decreasing = Index.is_monotonic_decreasing
_is_unique = Index.is_unique
astype = DatetimeIndexOpsMixin.astype

_timezone = cache_readonly(DatetimeArray._timezone.fget)
is_normalized = cache_readonly(DatetimeArray.is_normalized.fget)
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
# ------------------------------------------------------------------------
# Data

@property
def _eadata(self):
    """Return ``self._data`` unchanged."""
    backing = self._data
    return backing

@property
def _ndarray_values(self):
    """Return the ``_ndarray_values`` attribute of ``self._data``."""
    backing = self._data
    return backing._ndarray_values
Expand Down Expand Up @@ -539,16 +543,13 @@ def asof_locs(self, where, mask):
def astype(self, dtype, copy=True, how='start'):
# Cast this index to ``dtype``. Any datetime64 target is produced by
# to_timestamp(how=how) followed by tz_localize with the dtype's ``tz``
# attribute (None when the dtype has no such attribute).
# NOTE(review): this hunk is rendered without +/- markers. The two
# near-identical "'how' is index-..." comments and the two trailing return
# paths look like removed/added pairs -- confirm against the raw diff.
dtype = pandas_dtype(dtype)

# We have a few special-cases for `dtype`.
# Failing those, we fall back to astyping the values

if is_datetime64_any_dtype(dtype):
# 'how' is index-speicifc, isn't part of the EA interface.
# 'how' is index-specific, isn't part of the EA interface.
tz = getattr(dtype, 'tz', None)
return self.to_timestamp(how=how).tz_localize(tz)

result = self._data.astype(dtype, copy=copy)
return Index(result, name=self.name, dtype=dtype, copy=False)
# TODO: should probably raise on `how` here, so we don't ignore it.
# ``how`` is not forwarded on this path; only ``dtype`` and ``copy`` are.
return super(PeriodIndex, self).astype(dtype, copy=copy)

@Substitution(klass='PeriodIndex')
@Appender(_shared_docs['searchsorted'])
Expand Down
Loading