Skip to content

BUG: Fix #13149 and ENH: 'copy' param in Index.astype() #13209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ Other enhancements

- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)

- ``Index.astype()`` now accepts an optional boolean argument ``copy``; passing ``copy=False`` avoids copying the data when the requirements on dtype are satisfied (:issue:`13209`)

- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)

.. _whatsnew_0182.api:

API changes
Expand Down Expand Up @@ -143,6 +147,9 @@ This will now convert integers/floats with the default unit of ``ns``.
Other API changes
^^^^^^^^^^^^^^^^^

- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`)
- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`)

.. _whatsnew_0182.deprecations:

Deprecations
Expand Down
21 changes: 18 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,11 +336,26 @@ def copy(self):
categories=self.categories, ordered=self.ordered,
fastpath=True)

def astype(self, dtype):
""" coerce this type to another dtype """
def astype(self, dtype, copy=True):
"""
Coerce this type to another dtype

Parameters
----------
dtype : numpy dtype or pandas type
copy : bool, default True
By default, astype always returns a newly allocated object.
If copy is set to False and dtype is categorical, the original
object is returned.

.. versionadded:: 0.18.2

"""
if is_categorical_dtype(dtype):
if copy is True:
return self.copy()
return self
return np.array(self, dtype=dtype)
return np.array(self, dtype=dtype, copy=copy)

@cache_readonly
def ndim(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype):


def is_timedelta64_ns_dtype(arr_or_dtype):
tipo = _get_dtype_type(arr_or_dtype)
tipo = _get_dtype(arr_or_dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, to be consistent this should not change (as it will then be inconsistent with everything else). What is causing the issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If Y is an object returned by _get_dtype(X) then _get_dtype_type(X) returns Y.type if I understand correctly. And as _get_dtype_type never returns _TD_DTYPE, is_timedelta64_ns_dtype will always return False.

>>> pd.core.common._TD_DTYPE
Out[33]: dtype('<m8[ns]')
>>> pd.core.common._get_dtype_type(pd.core.common._TD_DTYPE)
Out[34]: numpy.timedelta64
>>> pd.core.common._get_dtype(pd.core.common._TD_DTYPE)
Out[35]: dtype('<m8[ns]')
>>> pd.core.common._TD_DTYPE == np.timedelta64
Out[36]: False
>>> pd.core.common.is_timedelta64_dtype('timedelta64[ns]')
Out[37]: True
>>> pd.core.common.is_timedelta64_ns_dtype('timedelta64[ns]')
Out[38]: False

Compare it with the definitions of is_datetime64...:

def is_datetime64_dtype(arr_or_dtype):
    try:
        tipo = _get_dtype_type(arr_or_dtype)
    except TypeError:
        return False
    return issubclass(tipo, np.datetime64)

def is_datetime64_ns_dtype(arr_or_dtype):
    try:
        tipo = _get_dtype(arr_or_dtype)
    except TypeError:
        return False
    return tipo == _NS_DTYPE

def is_timedelta64_dtype(arr_or_dtype):
    tipo = _get_dtype_type(arr_or_dtype)
    return issubclass(tipo, np.timedelta64)

def is_timedelta64_ns_dtype(arr_or_dtype):
    tipo = _get_dtype_type(arr_or_dtype)
    return tipo == _TD_DTYPE

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, I see, you are right. OK to fix then, and please update the tests in test_common.py

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I also add try and except to both functions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This protects us if we pass a dtype that numpy can't handle, e.g. datetime64[ns, tz],
so it's probably not necessary here (we would want it to blow up in that case, as it's obviously an error); for the datetimes it's really a convenience thing.

return tipo == _TD_DTYPE


Expand Down
6 changes: 3 additions & 3 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None):
values = tslib.array_to_datetime(values)
elif inferred_type in ('timedelta', 'timedelta64'):
# have a timedelta, convert to ns here
values = to_timedelta(values, errors='coerce')
values = to_timedelta(values, errors='coerce', box=False)
elif inferred_type == 'integer':
# py3 compat where dtype is 'm' but is an integer
if values.dtype.kind == 'm':
Expand Down Expand Up @@ -504,9 +504,9 @@ def _offset(lvalues, rvalues):

# convert Tick DateOffset to underlying delta
if self.is_offset_lhs:
lvalues = to_timedelta(lvalues)
lvalues = to_timedelta(lvalues, box=False)
if self.is_offset_rhs:
rvalues = to_timedelta(rvalues)
rvalues = to_timedelta(rvalues, box=False)

lvalues = lvalues.astype(np.int64)
if not self.is_floating_rhs:
Expand Down
24 changes: 22 additions & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,8 +754,28 @@ def _to_embed(self, keep_tz=False):
"""
return self.values.copy()

def astype(self, dtype):
return Index(self.values.astype(dtype), name=self.name, dtype=dtype)
_index_shared_docs['astype'] = """
Create an Index with values cast to dtypes. The class of a new Index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good use here of shared docs!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Learning fast :)

is determined by dtype. When conversion is impossible, a ValueError
exception is raised.

Parameters
----------
dtype : numpy dtype or pandas type
copy : bool, default True
By default, astype always returns a newly allocated object.
If copy is set to False and internal requirements on dtype are
satisfied, the original data is used to create a new Index
or the original Index is returned.

.. versionadded:: 0.18.2

"""

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
return Index(self.values.astype(dtype, copy=copy), name=self.name,
dtype=dtype)

def _to_safe_for_reshape(self):
""" convert to object if we are a categorical """
Expand Down
7 changes: 5 additions & 2 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2074,11 +2074,14 @@ def difference(self, other):
return MultiIndex.from_tuples(difference, sortorder=0,
names=result_names)

def astype(self, dtype):
@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
if not is_object_dtype(np.dtype(dtype)):
raise TypeError('Setting %s dtype to anything other than object '
'is not supported' % self.__class__)
return self._shallow_copy()
elif copy is True:
return self._shallow_copy()
return self

def _convert_can_do_setop(self, other):
result_names = self.names
Expand Down
15 changes: 10 additions & 5 deletions pandas/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas.index as _index

from pandas import compat
from pandas.indexes.base import Index, InvalidIndexError
from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
from pandas.util.decorators import Appender, cache_readonly
import pandas.core.common as com
from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype,
Expand Down Expand Up @@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
def inferred_type(self):
return 'floating'

def astype(self, dtype):
@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if is_float_dtype(dtype) or is_integer_dtype(dtype):
values = self._values.astype(dtype)
if is_float_dtype(dtype):
values = self._values.astype(dtype, copy=copy)
elif is_integer_dtype(dtype):
if self.hasnans:
raise ValueError('cannot convert float NaN to integer')
values = self._values.astype(dtype, copy=copy)
elif is_object_dtype(dtype):
values = self._values
values = self._values.astype('object', copy=copy)
else:
raise TypeError('Setting %s dtype to anything other than '
'float64 or object is not supported' %
Expand Down
207 changes: 204 additions & 3 deletions pandas/tests/indexes/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import numpy as np

from pandas import (date_range, period_range,
Series, Index, DatetimeIndex,
TimedeltaIndex, PeriodIndex)
from pandas import (DatetimeIndex, Float64Index, Index, Int64Index,
NaT, Period, PeriodIndex, Series, Timedelta,
TimedeltaIndex, date_range, period_range,
timedelta_range)

import pandas.util.testing as tm

Expand Down Expand Up @@ -849,3 +850,203 @@ def test_fillna_timedelta(self):
exp = pd.Index(
[pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object)
self.assert_index_equal(idx.fillna('x'), exp)


class TestAstype(tm.TestCase):

def test_DatetimeIndex_astype(self):
# GH 13149, GH 13209
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])

result = idx.astype(object)
expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object)
tm.assert_index_equal(result, expected)

result = idx.astype(int)
expected = Int64Index([1463356800000000000] +
[-9223372036854775808] * 3, dtype=np.int64)
tm.assert_index_equal(result, expected)

def test_DatetimeIndex_astype_str(self):
# GH 13149, GH 13209
# Also: Previously, Python2 returned a unicode representation u'NaT',
# instead of a string, due to a default parameter na_rep=u('NaT') in
# DatetimeIndex._format_native_types(). Consequently, 'result' had
# a mixed inferred type and failed tm.assert_index_equal().

idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
result = idx.astype(str)
expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object)
tm.assert_index_equal(result, expected)

def test_DatetimeIndex_astype_datetime64(self):
# GH 13149, GH 13209
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])

result = idx.astype('datetime64[ns]')
tm.assert_index_equal(result, idx)
self.assertFalse(result is idx)

result = idx.astype('datetime64[ns]', copy=False)
tm.assert_index_equal(result, idx)
self.assertTrue(result is idx)

idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST')
result = idx_tz.astype('datetime64[ns]')
expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'],
dtype='datetime64[ns]')
tm.assert_index_equal(result, expected)

def test_DatetimeIndex_astype_raises(self):
# GH 13149, GH 13209
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])

self.assertRaises(ValueError, idx.astype, float)
self.assertRaises(ValueError, idx.astype, 'timedelta64')
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
self.assertRaises(ValueError, idx.astype, 'datetime64')
self.assertRaises(ValueError, idx.astype, 'datetime64[D]')

def test_date_range(self):
rng = date_range('1/1/2000', periods=10)

result = rng.astype('i8')
self.assert_numpy_array_equal(result, rng.asi8)

# with tz
rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
result = rng.astype('datetime64[ns]')
expected = (date_range('1/1/2000', periods=10,
tz='US/Eastern')
.tz_convert('UTC').tz_localize(None))
tm.assert_index_equal(result, expected)

# BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex
result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
expected = pd.Series(
['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
tm.assert_series_equal(result, expected)

result = Series(pd.date_range('2012-01-01', periods=3,
tz='US/Eastern')).astype(str)
expected = Series(['2012-01-01 00:00:00-05:00',
'2012-01-02 00:00:00-05:00',
'2012-01-03 00:00:00-05:00'],
dtype=object)
tm.assert_series_equal(result, expected)

def test_DatetimeIndexOps_astype_str(self):
# test astype string - #10442
result = date_range('2012-01-01', periods=4,
name='test_name').astype(str)
expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
'2012-01-04'], name='test_name', dtype=object)
tm.assert_index_equal(result, expected)

# test astype string with tz and name
result = date_range('2012-01-01', periods=3, name='test_name',
tz='US/Eastern').astype(str)
expected = Index(['2012-01-01 00:00:00-05:00',
'2012-01-02 00:00:00-05:00',
'2012-01-03 00:00:00-05:00'],
name='test_name', dtype=object)
tm.assert_index_equal(result, expected)

# test astype string with freqH and name
result = date_range('1/1/2011', periods=3, freq='H',
name='test_name').astype(str)
expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
'2011-01-01 02:00:00'],
name='test_name', dtype=object)
tm.assert_index_equal(result, expected)

# test astype string with freqH and timezone
result = date_range('3/6/2012 00:00', periods=2, freq='H',
tz='Europe/London', name='test_name').astype(str)
expected = Index(['2012-03-06 00:00:00+00:00',
'2012-03-06 01:00:00+00:00'],
dtype=object, name='test_name')
tm.assert_index_equal(result, expected)

def test_TimedeltaIndex_astype(self):
# GH 13149, GH 13209
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])

result = idx.astype(object)
expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3,
dtype=object)
tm.assert_index_equal(result, expected)

result = idx.astype(int)
expected = Int64Index([100000000000000] + [-9223372036854775808] * 3,
dtype=np.int64)
tm.assert_index_equal(result, expected)

def test_TimedeltaIndex_astype_timedelta64(self):
# GH 13149, GH 13209
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])

result = idx.astype('timedelta64')
expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64')
tm.assert_index_equal(result, expected)

result = idx.astype('timedelta64[ns]')
tm.assert_index_equal(result, idx)
self.assertFalse(result is idx)

result = idx.astype('timedelta64[ns]', copy=False)
tm.assert_index_equal(result, idx)
self.assertTrue(result is idx)

def test_TimedeltaIndex_astype_raises(self):
# GH 13149, GH 13209
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])

self.assertRaises(ValueError, idx.astype, float)
self.assertRaises(ValueError, idx.astype, str)
self.assertRaises(ValueError, idx.astype, 'datetime64')
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')

def test_timedelta_range(self):
rng = timedelta_range('1 days', periods=10)

result = rng.astype('i8')
self.assert_numpy_array_equal(result, rng.asi8)

def test_PeriodIndex(self):
# GH 13149, GH 13209
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')

result = idx.astype(object)
expected = Index([Period('2016-05-16', freq='D')] +
[Period(NaT, freq='D')] * 3, dtype='object')
# Hack because of lack of support for Period null checking (GH12759)
tm.assert_index_equal(result[:1], expected[:1])
result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64)
expected_arr = np.asarray([p.ordinal for p in expected],
dtype=np.int64)
tm.assert_numpy_array_equal(result_arr, expected_arr)
# TODO: When GH12759 is resolved, change the above hack to:
# tm.assert_index_equal(result, expected) # now, it raises.

result = idx.astype(int)
expected = Int64Index([16937] + [-9223372036854775808] * 3,
dtype=np.int64)
tm.assert_index_equal(result, expected)

def test_PeriodIndex_raises(self):
# GH 13149, GH 13209
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')

self.assertRaises(ValueError, idx.astype, str)
self.assertRaises(ValueError, idx.astype, float)
self.assertRaises(ValueError, idx.astype, 'timedelta64')
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
self.assertRaises(ValueError, idx.astype, 'datetime64')
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')

def test_period_range(self):
idx = period_range('1990', '2009', freq='A')

result = idx.astype('i8')
self.assert_numpy_array_equal(result, idx.values)
5 changes: 5 additions & 0 deletions pandas/tests/indexes/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ def test_astype(self):
for dtype in ['M8[ns]', 'm8[ns]']:
self.assertRaises(TypeError, lambda: i.astype(dtype))

# GH 13149
for dtype in ['int16', 'int32', 'int64']:
i = Float64Index([0, 1.1, np.NAN])
self.assertRaises(ValueError, lambda: i.astype(dtype))

def test_equals(self):

i = Float64Index([1.0, 2.0])
Expand Down
Loading