pandas-dev · pijucha · May 17, 2016 · jreback · May 19, 2016 · pijucha
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -49,6 +49,10 @@ Other enhancements
 
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
 
+- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
+
+- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
+
 .. _whatsnew_0182.api:
 
 API changes
@@ -143,6 +147,9 @@ This will now convert integers/floats with the default unit of ``ns``.
 Other API changes
 ^^^^^^^^^^^^^^^^^
 
+- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`)
+- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`)
+
 .. _whatsnew_0182.deprecations:
 
 Deprecations

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -336,11 +336,26 @@ def copy(self):
                            categories=self.categories, ordered=self.ordered,
                            fastpath=True)
 
-    def astype(self, dtype):
-        """ coerce this type to another dtype """
+    def astype(self, dtype, copy=True):
+        """
+        Coerce this type to another dtype
+
+        Parameters
+        ----------
+        dtype : numpy dtype or pandas type
+        copy : bool, default True
+            By default, astype always returns a newly allocated object.
+            If copy is set to False and dtype is categorical, the original
+            object is returned.
+
+            .. versionadded:: 0.18.2
+
+        """
         if is_categorical_dtype(dtype):
+            if copy:
+                return self.copy()
             return self
-        return np.array(self, dtype=dtype)
+        return np.array(self, dtype=dtype, copy=copy)
 
     @cache_readonly
     def ndim(self):

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -1600,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype):
 
 
 def is_timedelta64_ns_dtype(arr_or_dtype):
-    tipo = _get_dtype_type(arr_or_dtype)
+    tipo = _get_dtype(arr_or_dtype)
     return tipo == _TD_DTYPE
 
 

diff --git a/pandas/core/ops.py b/pandas/core/ops.py
@@ -422,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None):
                 values = tslib.array_to_datetime(values)
         elif inferred_type in ('timedelta', 'timedelta64'):
             # have a timedelta, convert to to ns here
-            values = to_timedelta(values, errors='coerce')
+            values = to_timedelta(values, errors='coerce', box=False)
         elif inferred_type == 'integer':
             # py3 compat where dtype is 'm' but is an integer
             if values.dtype.kind == 'm':
@@ -504,9 +504,9 @@ def _offset(lvalues, rvalues):
 
             # convert Tick DateOffset to underlying delta
             if self.is_offset_lhs:
-                lvalues = to_timedelta(lvalues)
+                lvalues = to_timedelta(lvalues, box=False)
             if self.is_offset_rhs:
-                rvalues = to_timedelta(rvalues)
+                rvalues = to_timedelta(rvalues, box=False)
 
             lvalues = lvalues.astype(np.int64)
             if not self.is_floating_rhs:

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -754,8 +754,28 @@ def _to_embed(self, keep_tz=False):
         """
         return self.values.copy()
 
-    def astype(self, dtype):
-        return Index(self.values.astype(dtype), name=self.name, dtype=dtype)
+    _index_shared_docs['astype'] = """
+        Create an Index with values cast to dtype. The class of a new Index
+        is determined by dtype. When conversion is impossible, a ValueError
+        or TypeError exception is raised, depending on the Index subclass.
+
+        Parameters
+        ----------
+        dtype : numpy dtype or pandas type
+        copy : bool, default True
+            By default, astype always returns a newly allocated object.
+            If copy is set to False and internal requirements on dtype are
+            satisfied, the original data is used to create a new Index
+            or the original Index is returned.
+
+            .. versionadded:: 0.18.2
+
+        """
+
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
+        return Index(self.values.astype(dtype, copy=copy), name=self.name,
+                     dtype=dtype)
 
     def _to_safe_for_reshape(self):
         """ convert to object if we are a categorical """

diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -2074,11 +2074,14 @@ def difference(self, other):
             return MultiIndex.from_tuples(difference, sortorder=0,
                                           names=result_names)
 
-    def astype(self, dtype):
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
         if not is_object_dtype(np.dtype(dtype)):
             raise TypeError('Setting %s dtype to anything other than object '
                             'is not supported' % self.__class__)
-        return self._shallow_copy()
+        elif copy:
+            return self._shallow_copy()
+        return self
 
     def _convert_can_do_setop(self, other):
         result_names = self.names

diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py
@@ -4,7 +4,7 @@
 import pandas.index as _index
 
 from pandas import compat
-from pandas.indexes.base import Index, InvalidIndexError
+from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.core.common as com
 from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype,
@@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
     def inferred_type(self):
         return 'floating'
 
-    def astype(self, dtype):
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
         dtype = pandas_dtype(dtype)
-        if is_float_dtype(dtype) or is_integer_dtype(dtype):
-            values = self._values.astype(dtype)
+        if is_float_dtype(dtype):
+            values = self._values.astype(dtype, copy=copy)
+        elif is_integer_dtype(dtype):
+            if self.hasnans:
+                raise ValueError('cannot convert float NaN to integer')
+            values = self._values.astype(dtype, copy=copy)
         elif is_object_dtype(dtype):
-            values = self._values
+            values = self._values.astype('object', copy=copy)
         else:
             raise TypeError('Setting %s dtype to anything other than '
                             'float64 or object is not supported' %

diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py
@@ -4,9 +4,10 @@
 
 import numpy as np
 
-from pandas import (date_range, period_range,
-                    Series, Index, DatetimeIndex,
-                    TimedeltaIndex, PeriodIndex)
+from pandas import (DatetimeIndex, Float64Index, Index, Int64Index,
+                    NaT, Period, PeriodIndex, Series, Timedelta,
+                    TimedeltaIndex, date_range, period_range,
+                    timedelta_range)
 
 import pandas.util.testing as tm
 
@@ -849,3 +850,203 @@ def test_fillna_timedelta(self):
         exp = pd.Index(
             [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object)
         self.assert_index_equal(idx.fillna('x'), exp)
+
+
+class TestAstype(tm.TestCase):
+
+    def test_DatetimeIndex_astype(self):
+        # GH 13149, GH 13209
+        idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+
+        result = idx.astype(object)
+        expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = idx.astype(int)
+        expected = Int64Index([1463356800000000000] +
+                              [-9223372036854775808] * 3, dtype=np.int64)
+        tm.assert_index_equal(result, expected)
+
+    def test_DatetimeIndex_astype_str(self):
+        # GH 13149, GH 13209
+        # Also: Previously, Python2 returned a unicode representation u'NaT',
+        # instead of a string, due to a default parameter na_rep=u('NaT') in
+        # DatetimeIndex._format_native_types(). Consequently, 'result' had
+        # a mixed inferred type and failed tm.assert_index_equal().
+
+        idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+        result = idx.astype(str)
+        expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object)
+        tm.assert_index_equal(result, expected)
+
+    def test_DatetimeIndex_astype_datetime64(self):
+        # GH 13149, GH 13209
+        idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+
+        result = idx.astype('datetime64[ns]')
+        tm.assert_index_equal(result, idx)
+        self.assertFalse(result is idx)
+
+        result = idx.astype('datetime64[ns]', copy=False)
+        tm.assert_index_equal(result, idx)
+        self.assertTrue(result is idx)
+
+        idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST')
+        result = idx_tz.astype('datetime64[ns]')
+        expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'],
+                                 dtype='datetime64[ns]')
+        tm.assert_index_equal(result, expected)
+
+    def test_DatetimeIndex_astype_raises(self):
+        # GH 13149, GH 13209
+        idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
+
+        self.assertRaises(ValueError, idx.astype, float)
+        self.assertRaises(ValueError, idx.astype, 'timedelta64')
+        self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
+        self.assertRaises(ValueError, idx.astype, 'datetime64')
+        self.assertRaises(ValueError, idx.astype, 'datetime64[D]')
+
+    def test_date_range(self):
+        rng = date_range('1/1/2000', periods=10)
+
+        result = rng.astype('i8')
+        self.assert_numpy_array_equal(result, rng.asi8)
+
+        # with tz
+        rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
+        result = rng.astype('datetime64[ns]')
+        expected = (date_range('1/1/2000', periods=10,
+                               tz='US/Eastern')
+                    .tz_convert('UTC').tz_localize(None))
+        tm.assert_index_equal(result, expected)
+
+        # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex
+        result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
+        expected = pd.Series(
+            ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        result = Series(pd.date_range('2012-01-01', periods=3,
+                                      tz='US/Eastern')).astype(str)
+        expected = Series(['2012-01-01 00:00:00-05:00',
+                           '2012-01-02 00:00:00-05:00',
+                           '2012-01-03 00:00:00-05:00'],
+                          dtype=object)
+        tm.assert_series_equal(result, expected)
+
+    def test_DatetimeIndexOps_astype_str(self):
+        # test astype string - #10442
+        result = date_range('2012-01-01', periods=4,
+                            name='test_name').astype(str)
+        expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
+                          '2012-01-04'], name='test_name', dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        # test astype string with tz and name
+        result = date_range('2012-01-01', periods=3, name='test_name',
+                            tz='US/Eastern').astype(str)
+        expected = Index(['2012-01-01 00:00:00-05:00',
+                          '2012-01-02 00:00:00-05:00',
+                          '2012-01-03 00:00:00-05:00'],
+                         name='test_name', dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        # test astype string with freqH and name
+        result = date_range('1/1/2011', periods=3, freq='H',
+                            name='test_name').astype(str)
+        expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
+                          '2011-01-01 02:00:00'],
+                         name='test_name', dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        # test astype string with freqH and timezone
+        result = date_range('3/6/2012 00:00', periods=2, freq='H',
+                            tz='Europe/London', name='test_name').astype(str)
+        expected = Index(['2012-03-06 00:00:00+00:00',
+                          '2012-03-06 01:00:00+00:00'],
+                         dtype=object, name='test_name')
+        tm.assert_index_equal(result, expected)
+
+    def test_TimedeltaIndex_astype(self):
+        # GH 13149, GH 13209
+        idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
+
+        result = idx.astype(object)
+        expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3,
+                         dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = idx.astype(int)
+        expected = Int64Index([100000000000000] + [-9223372036854775808] * 3,
+                              dtype=np.int64)
+        tm.assert_index_equal(result, expected)
+
+    def test_TimedeltaIndex_astype_timedelta64(self):
+        # GH 13149, GH 13209
+        idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
+
+        result = idx.astype('timedelta64')
+        expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64')
+        tm.assert_index_equal(result, expected)
+
+        result = idx.astype('timedelta64[ns]')
+        tm.assert_index_equal(result, idx)
+        self.assertFalse(result is idx)
+
+        result = idx.astype('timedelta64[ns]', copy=False)
+        tm.assert_index_equal(result, idx)
+        self.assertTrue(result is idx)
+
+    def test_TimedeltaIndex_astype_raises(self):
+        # GH 13149, GH 13209
+        idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
+
+        self.assertRaises(ValueError, idx.astype, float)
+        self.assertRaises(ValueError, idx.astype, str)
+        self.assertRaises(ValueError, idx.astype, 'datetime64')
+        self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
+
+    def test_timedelta_range(self):
+        rng = timedelta_range('1 days', periods=10)
+
+        result = rng.astype('i8')
+        self.assert_numpy_array_equal(result, rng.asi8)
+
+    def test_PeriodIndex(self):
+        # GH 13149, GH 13209
+        idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
+
+        result = idx.astype(object)
+        expected = Index([Period('2016-05-16', freq='D')] +
+                         [Period(NaT, freq='D')] * 3, dtype='object')
+        # Hack because of lack of support for Period null checking (GH12759)
+        tm.assert_index_equal(result[:1], expected[:1])
+        result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64)
+        expected_arr = np.asarray([p.ordinal for p in expected],
+                                  dtype=np.int64)
+        tm.assert_numpy_array_equal(result_arr, expected_arr)
+        # TODO: When GH12759 is resolved, change the above hack to:
+        # tm.assert_index_equal(result, expected)         # now, it raises.
+
+        result = idx.astype(int)
+        expected = Int64Index([16937] + [-9223372036854775808] * 3,
+                              dtype=np.int64)
+        tm.assert_index_equal(result, expected)
+
+    def test_PeriodIndex_raises(self):
+        # GH 13149, GH 13209
+        idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
+
+        self.assertRaises(ValueError, idx.astype, str)
+        self.assertRaises(ValueError, idx.astype, float)
+        self.assertRaises(ValueError, idx.astype, 'timedelta64')
+        self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
+        self.assertRaises(ValueError, idx.astype, 'datetime64')
+        self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
+
+    def test_period_range(self):
+        idx = period_range('1990', '2009', freq='A')
+
+        result = idx.astype('i8')
+        self.assert_numpy_array_equal(result, idx.values)
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -259,6 +259,11 @@ def test_astype(self):
         for dtype in ['M8[ns]', 'm8[ns]']:
             self.assertRaises(TypeError, lambda: i.astype(dtype))
 
+        # GH 13149
+        for dtype in ['int16', 'int32', 'int64']:
+            i = Float64Index([0, 1.1, np.NAN])
+            self.assertRaises(ValueError, lambda: i.astype(dtype))
+
     def test_equals(self):
 
         i = Float64Index([1.0, 2.0])