Skip to content

Commit 8b29902

Browse files
committed
BUG: Fix #13149 and ENH: 'copy' param in Index.astype()
1. Float64Index.astype(int) raises ValueError if a NaN is present. Previously, it converted NaNs to the smallest negative integer. 2. TimedeltaIndex.astype(int) and DatetimeIndex.astype(int) return Int64Index, which is consistent with the behavior of other Indexes. Previously, they returned a numpy.array of ints. 3. Added: - bool parameter 'copy' to Index.astype() - shared doc string to .astype() - tests on .astype() (consolidated and added new) - bool parameter 'copy' to Categorical.astype() 4. Internals: - Fixed core.common.is_timedelta64_ns_dtype(). - Set the default NaT representation to a string type in the na_rep parameter of DatetimeIndex._format_native_types(). Previously, it produced a unicode u'NaT' in Python2.
1 parent f8a11dd commit 8b29902

17 files changed

+340
-132
lines changed

doc/source/whatsnew/v0.18.2.txt

+7
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ Other enhancements
4949

5050
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
5151

52+
- ``Index.astype()`` now accepts an optional boolean argument ``copy``. With ``copy=False``, the original data is reused (or the original ``Index`` is returned) when the requirements on dtype are already satisfied (:issue:`13209`)
53+
54+
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``. With ``copy=False``, the original object is returned when ``dtype`` is categorical (:issue:`13209`)
55+
5256
.. _whatsnew_0182.api:
5357

5458
API changes
@@ -143,6 +147,9 @@ This will now convert integers/floats with the default unit of ``ns``.
143147
Other API changes
144148
^^^^^^^^^^^^^^^^^
145149

150+
- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`)
151+
- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`)
152+
146153
.. _whatsnew_0182.deprecations:
147154

148155
Deprecations

pandas/core/categorical.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -336,11 +336,26 @@ def copy(self):
336336
categories=self.categories, ordered=self.ordered,
337337
fastpath=True)
338338

339-
def astype(self, dtype):
340-
""" coerce this type to another dtype """
339+
def astype(self, dtype, copy=True):
340+
"""
341+
Coerce this type to another dtype
342+
343+
Parameters
344+
----------
345+
dtype : numpy dtype or pandas type
346+
copy : bool, default True
347+
By default, astype always returns a newly allocated object.
348+
If copy is set to False and dtype is categorical, the original
349+
object is returned.
350+
351+
.. versionadded:: 0.18.2
352+
353+
"""
341354
if is_categorical_dtype(dtype):
355+
if copy is True:
356+
return self.copy()
342357
return self
343-
return np.array(self, dtype=dtype)
358+
return np.array(self, dtype=dtype, copy=copy)
344359

345360
@cache_readonly
346361
def ndim(self):

pandas/core/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype):
16001600

16011601

16021602
def is_timedelta64_ns_dtype(arr_or_dtype):
1603-
tipo = _get_dtype_type(arr_or_dtype)
1603+
tipo = _get_dtype(arr_or_dtype)
16041604
return tipo == _TD_DTYPE
16051605

16061606

pandas/core/ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None):
422422
values = tslib.array_to_datetime(values)
423423
elif inferred_type in ('timedelta', 'timedelta64'):
424424
# have a timedelta, convert to ns here
425-
values = to_timedelta(values, errors='coerce')
425+
values = to_timedelta(values, errors='coerce', box=False)
426426
elif inferred_type == 'integer':
427427
# py3 compat where dtype is 'm' but is an integer
428428
if values.dtype.kind == 'm':
@@ -504,9 +504,9 @@ def _offset(lvalues, rvalues):
504504

505505
# convert Tick DateOffset to underlying delta
506506
if self.is_offset_lhs:
507-
lvalues = to_timedelta(lvalues)
507+
lvalues = to_timedelta(lvalues, box=False)
508508
if self.is_offset_rhs:
509-
rvalues = to_timedelta(rvalues)
509+
rvalues = to_timedelta(rvalues, box=False)
510510

511511
lvalues = lvalues.astype(np.int64)
512512
if not self.is_floating_rhs:

pandas/indexes/base.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -754,8 +754,28 @@ def _to_embed(self, keep_tz=False):
754754
"""
755755
return self.values.copy()
756756

757-
def astype(self, dtype):
758-
return Index(self.values.astype(dtype), name=self.name, dtype=dtype)
757+
_index_shared_docs['astype'] = """
758+
Create an Index with values cast to dtypes. The class of a new Index
759+
is determined by dtype. When conversion is impossible, a ValueError
760+
exception is raised.
761+
762+
Parameters
763+
----------
764+
dtype : numpy dtype or pandas type
765+
copy : bool, default True
766+
By default, astype always returns a newly allocated object.
767+
If copy is set to False and internal requirements on dtype are
768+
satisfied, the original data is used to create a new Index
769+
or the original Index is returned.
770+
771+
.. versionadded:: 0.18.2
772+
773+
"""
774+
775+
@Appender(_index_shared_docs['astype'])
776+
def astype(self, dtype, copy=True):
777+
return Index(self.values.astype(dtype, copy=copy), name=self.name,
778+
dtype=dtype)
759779

760780
def _to_safe_for_reshape(self):
761781
""" convert to object if we are a categorical """

pandas/indexes/multi.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -2074,11 +2074,14 @@ def difference(self, other):
20742074
return MultiIndex.from_tuples(difference, sortorder=0,
20752075
names=result_names)
20762076

2077-
def astype(self, dtype):
2077+
@Appender(_index_shared_docs['astype'])
2078+
def astype(self, dtype, copy=True):
20782079
if not is_object_dtype(np.dtype(dtype)):
20792080
raise TypeError('Setting %s dtype to anything other than object '
20802081
'is not supported' % self.__class__)
2081-
return self._shallow_copy()
2082+
elif copy is True:
2083+
return self._shallow_copy()
2084+
return self
20822085

20832086
def _convert_can_do_setop(self, other):
20842087
result_names = self.names

pandas/indexes/numeric.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pandas.index as _index
55

66
from pandas import compat
7-
from pandas.indexes.base import Index, InvalidIndexError
7+
from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
88
from pandas.util.decorators import Appender, cache_readonly
99
import pandas.core.common as com
1010
from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype,
@@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
238238
def inferred_type(self):
239239
return 'floating'
240240

241-
def astype(self, dtype):
241+
@Appender(_index_shared_docs['astype'])
242+
def astype(self, dtype, copy=True):
242243
dtype = pandas_dtype(dtype)
243-
if is_float_dtype(dtype) or is_integer_dtype(dtype):
244-
values = self._values.astype(dtype)
244+
if is_float_dtype(dtype):
245+
values = self._values.astype(dtype, copy=copy)
246+
elif is_integer_dtype(dtype):
247+
if self.hasnans:
248+
raise ValueError('cannot convert float NaN to integer')
249+
values = self._values.astype(dtype, copy=copy)
245250
elif is_object_dtype(dtype):
246-
values = self._values
251+
values = self._values.astype('object', copy=copy)
247252
else:
248253
raise TypeError('Setting %s dtype to anything other than '
249254
'float64 or object is not supported' %

pandas/tests/indexes/test_datetimelike.py

+204-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44

55
import numpy as np
66

7-
from pandas import (date_range, period_range,
8-
Series, Index, DatetimeIndex,
9-
TimedeltaIndex, PeriodIndex)
7+
from pandas import (DatetimeIndex, Float64Index, Index, Int64Index,
8+
NaT, Period, PeriodIndex, Series, Timedelta,
9+
TimedeltaIndex, date_range, period_range,
10+
timedelta_range)
1011

1112
import pandas.util.testing as tm
1213

@@ -849,3 +850,203 @@ def test_fillna_timedelta(self):
849850
exp = pd.Index(
850851
[pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object)
851852
self.assert_index_equal(idx.fillna('x'), exp)
853+
854+
855+
class TestAstype(tm.TestCase):
856+
857+
def test_DatetimeIndex_astype(self):
858+
# GH 13149, GH 13209
859+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
860+
861+
result = idx.astype(object)
862+
expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object)
863+
tm.assert_index_equal(result, expected)
864+
865+
result = idx.astype(int)
866+
expected = Int64Index([1463356800000000000] +
867+
[-9223372036854775808] * 3, dtype=np.int64)
868+
tm.assert_index_equal(result, expected)
869+
870+
def test_DatetimeIndex_astype_str(self):
871+
# GH 13149, GH 13209
872+
# Also: Previously, Python2 returned a unicode representation u'NaT',
873+
# instead of a string, due to a default parameter na_rep=u('NaT') in
874+
# DatetimeIndex._format_native_types(). Consequently, 'result' had
875+
# a mixed inferred type and failed tm.assert_index_equal().
876+
877+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
878+
result = idx.astype(str)
879+
expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object)
880+
tm.assert_index_equal(result, expected)
881+
882+
def test_DatetimeIndex_astype_datetime64(self):
883+
# GH 13149, GH 13209
884+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
885+
886+
result = idx.astype('datetime64[ns]')
887+
tm.assert_index_equal(result, idx)
888+
self.assertFalse(result is idx)
889+
890+
result = idx.astype('datetime64[ns]', copy=False)
891+
tm.assert_index_equal(result, idx)
892+
self.assertTrue(result is idx)
893+
894+
idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST')
895+
result = idx_tz.astype('datetime64[ns]')
896+
expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'],
897+
dtype='datetime64[ns]')
898+
tm.assert_index_equal(result, expected)
899+
900+
def test_DatetimeIndex_astype_raises(self):
901+
# GH 13149, GH 13209
902+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
903+
904+
self.assertRaises(ValueError, idx.astype, float)
905+
self.assertRaises(ValueError, idx.astype, 'timedelta64')
906+
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
907+
self.assertRaises(ValueError, idx.astype, 'datetime64')
908+
self.assertRaises(ValueError, idx.astype, 'datetime64[D]')
909+
910+
def test_date_range(self):
911+
rng = date_range('1/1/2000', periods=10)
912+
913+
result = rng.astype('i8')
914+
self.assert_numpy_array_equal(result, rng.asi8)
915+
916+
# with tz
917+
rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
918+
result = rng.astype('datetime64[ns]')
919+
expected = (date_range('1/1/2000', periods=10,
920+
tz='US/Eastern')
921+
.tz_convert('UTC').tz_localize(None))
922+
tm.assert_index_equal(result, expected)
923+
924+
# BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex
925+
result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
926+
expected = pd.Series(
927+
['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
928+
tm.assert_series_equal(result, expected)
929+
930+
result = Series(pd.date_range('2012-01-01', periods=3,
931+
tz='US/Eastern')).astype(str)
932+
expected = Series(['2012-01-01 00:00:00-05:00',
933+
'2012-01-02 00:00:00-05:00',
934+
'2012-01-03 00:00:00-05:00'],
935+
dtype=object)
936+
tm.assert_series_equal(result, expected)
937+
938+
def test_DatetimeIndexOps_astype_str(self):
939+
# test astype string - #10442
940+
result = date_range('2012-01-01', periods=4,
941+
name='test_name').astype(str)
942+
expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
943+
'2012-01-04'], name='test_name', dtype=object)
944+
tm.assert_index_equal(result, expected)
945+
946+
# test astype string with tz and name
947+
result = date_range('2012-01-01', periods=3, name='test_name',
948+
tz='US/Eastern').astype(str)
949+
expected = Index(['2012-01-01 00:00:00-05:00',
950+
'2012-01-02 00:00:00-05:00',
951+
'2012-01-03 00:00:00-05:00'],
952+
name='test_name', dtype=object)
953+
tm.assert_index_equal(result, expected)
954+
955+
# test astype string with freqH and name
956+
result = date_range('1/1/2011', periods=3, freq='H',
957+
name='test_name').astype(str)
958+
expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
959+
'2011-01-01 02:00:00'],
960+
name='test_name', dtype=object)
961+
tm.assert_index_equal(result, expected)
962+
963+
# test astype string with freqH and timezone
964+
result = date_range('3/6/2012 00:00', periods=2, freq='H',
965+
tz='Europe/London', name='test_name').astype(str)
966+
expected = Index(['2012-03-06 00:00:00+00:00',
967+
'2012-03-06 01:00:00+00:00'],
968+
dtype=object, name='test_name')
969+
tm.assert_index_equal(result, expected)
970+
971+
def test_TimedeltaIndex_astype(self):
972+
# GH 13149, GH 13209
973+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
974+
975+
result = idx.astype(object)
976+
expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3,
977+
dtype=object)
978+
tm.assert_index_equal(result, expected)
979+
980+
result = idx.astype(int)
981+
expected = Int64Index([100000000000000] + [-9223372036854775808] * 3,
982+
dtype=np.int64)
983+
tm.assert_index_equal(result, expected)
984+
985+
def test_TimedeltaIndex_astype_timedelta64(self):
986+
# GH 13149, GH 13209
987+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
988+
989+
result = idx.astype('timedelta64')
990+
expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64')
991+
tm.assert_index_equal(result, expected)
992+
993+
result = idx.astype('timedelta64[ns]')
994+
tm.assert_index_equal(result, idx)
995+
self.assertFalse(result is idx)
996+
997+
result = idx.astype('timedelta64[ns]', copy=False)
998+
tm.assert_index_equal(result, idx)
999+
self.assertTrue(result is idx)
1000+
1001+
def test_TimedeltaIndex_astype_raises(self):
1002+
# GH 13149, GH 13209
1003+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
1004+
1005+
self.assertRaises(ValueError, idx.astype, float)
1006+
self.assertRaises(ValueError, idx.astype, str)
1007+
self.assertRaises(ValueError, idx.astype, 'datetime64')
1008+
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
1009+
1010+
def test_timedelta_range(self):
1011+
rng = timedelta_range('1 days', periods=10)
1012+
1013+
result = rng.astype('i8')
1014+
self.assert_numpy_array_equal(result, rng.asi8)
1015+
1016+
def test_PeriodIndex(self):
1017+
# GH 13149, GH 13209
1018+
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
1019+
1020+
result = idx.astype(object)
1021+
expected = Index([Period('2016-05-16', freq='D')] +
1022+
[Period(NaT, freq='D')] * 3, dtype='object')
1023+
# Hack because of lack of support for Period null checking (GH12759)
1024+
tm.assert_index_equal(result[:1], expected[:1])
1025+
result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64)
1026+
expected_arr = np.asarray([p.ordinal for p in expected],
1027+
dtype=np.int64)
1028+
tm.assert_numpy_array_equal(result_arr, expected_arr)
1029+
# TODO: When GH12759 is resolved, change the above hack to:
1030+
# tm.assert_index_equal(result, expected) # now, it raises.
1031+
1032+
result = idx.astype(int)
1033+
expected = Int64Index([16937] + [-9223372036854775808] * 3,
1034+
dtype=np.int64)
1035+
tm.assert_index_equal(result, expected)
1036+
1037+
def test_PeriodIndex_raises(self):
1038+
# GH 13149, GH 13209
1039+
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
1040+
1041+
self.assertRaises(ValueError, idx.astype, str)
1042+
self.assertRaises(ValueError, idx.astype, float)
1043+
self.assertRaises(ValueError, idx.astype, 'timedelta64')
1044+
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
1045+
self.assertRaises(ValueError, idx.astype, 'datetime64')
1046+
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
1047+
1048+
def test_period_range(self):
1049+
idx = period_range('1990', '2009', freq='A')
1050+
1051+
result = idx.astype('i8')
1052+
self.assert_numpy_array_equal(result, idx.values)

pandas/tests/indexes/test_numeric.py

+5
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,11 @@ def test_astype(self):
259259
for dtype in ['M8[ns]', 'm8[ns]']:
260260
self.assertRaises(TypeError, lambda: i.astype(dtype))
261261

262+
# GH 13149
263+
for dtype in ['int16', 'int32', 'int64']:
264+
i = Float64Index([0, 1.1, np.NAN])
265+
self.assertRaises(ValueError, lambda: i.astype(dtype))
266+
262267
def test_equals(self):
263268

264269
i = Float64Index([1.0, 2.0])

0 commit comments

Comments
 (0)