Skip to content

Commit afde718

Browse files
pijucha authored and jreback committed
BUG: Fix #13149 and ENH: 'copy' param in Index.astype()
closes #13149 1. Float64Index.astype(int) raises ValueError if a NaN is present. Previously, it converted NaN's to the smallest negative integer. 2. TimedeltaIndex.astype(int) and DatetimeIndex.astype(int) return Int64Index, which is consistent with the behavior of other Indexes. Previously, they returned a numpy.array of ints. 3. Added bool parameter 'copy' to Index.astype(). 4. Fixed core.common.is_timedelta64_ns_dtype(). 5. Set a default NaT representation to a string type in a parameter of DatetimeIndex._format_native_types(). Previously, it produced a unicode u'NaT' in Python2. Author: pijucha <[email protected]> Closes #13209 from pijucha/bug13149 and squashes the following commits: 8b29902 [pijucha] BUG: Fix #13149 and ENH: 'copy' param in Index.astype()
1 parent f8a11dd commit afde718

17 files changed

+330
-132
lines changed

doc/source/whatsnew/v0.18.2.txt

+6
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ Other enhancements
4949

5050
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
5151

52+
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
53+
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
54+
5255
.. _whatsnew_0182.api:
5356

5457
API changes
@@ -143,6 +146,9 @@ This will now convert integers/floats with the default unit of ``ns``.
143146
Other API changes
144147
^^^^^^^^^^^^^^^^^
145148

149+
- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`)
150+
- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`)
151+
146152
.. _whatsnew_0182.deprecations:
147153

148154
Deprecations

pandas/core/categorical.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -336,11 +336,26 @@ def copy(self):
336336
categories=self.categories, ordered=self.ordered,
337337
fastpath=True)
338338

339-
def astype(self, dtype):
340-
""" coerce this type to another dtype """
339+
def astype(self, dtype, copy=True):
340+
"""
341+
Coerce this type to another dtype
342+
343+
Parameters
344+
----------
345+
dtype : numpy dtype or pandas type
346+
copy : bool, default True
347+
By default, astype always returns a newly allocated object.
348+
If copy is set to False and dtype is categorical, the original
349+
object is returned.
350+
351+
.. versionadded:: 0.18.2
352+
353+
"""
341354
if is_categorical_dtype(dtype):
355+
if copy is True:
356+
return self.copy()
342357
return self
343-
return np.array(self, dtype=dtype)
358+
return np.array(self, dtype=dtype, copy=copy)
344359

345360
@cache_readonly
346361
def ndim(self):

pandas/core/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype):
16001600

16011601

16021602
def is_timedelta64_ns_dtype(arr_or_dtype):
1603-
tipo = _get_dtype_type(arr_or_dtype)
1603+
tipo = _get_dtype(arr_or_dtype)
16041604
return tipo == _TD_DTYPE
16051605

16061606

pandas/core/ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None):
422422
values = tslib.array_to_datetime(values)
423423
elif inferred_type in ('timedelta', 'timedelta64'):
424424
# have a timedelta, convert to ns here
425-
values = to_timedelta(values, errors='coerce')
425+
values = to_timedelta(values, errors='coerce', box=False)
426426
elif inferred_type == 'integer':
427427
# py3 compat where dtype is 'm' but is an integer
428428
if values.dtype.kind == 'm':
@@ -504,9 +504,9 @@ def _offset(lvalues, rvalues):
504504

505505
# convert Tick DateOffset to underlying delta
506506
if self.is_offset_lhs:
507-
lvalues = to_timedelta(lvalues)
507+
lvalues = to_timedelta(lvalues, box=False)
508508
if self.is_offset_rhs:
509-
rvalues = to_timedelta(rvalues)
509+
rvalues = to_timedelta(rvalues, box=False)
510510

511511
lvalues = lvalues.astype(np.int64)
512512
if not self.is_floating_rhs:

pandas/indexes/base.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -754,8 +754,28 @@ def _to_embed(self, keep_tz=False):
754754
"""
755755
return self.values.copy()
756756

757-
def astype(self, dtype):
758-
return Index(self.values.astype(dtype), name=self.name, dtype=dtype)
757+
_index_shared_docs['astype'] = """
758+
Create an Index with values cast to dtypes. The class of a new Index
759+
is determined by dtype. When conversion is impossible, a ValueError
760+
exception is raised.
761+
762+
Parameters
763+
----------
764+
dtype : numpy dtype or pandas type
765+
copy : bool, default True
766+
By default, astype always returns a newly allocated object.
767+
If copy is set to False and internal requirements on dtype are
768+
satisfied, the original data is used to create a new Index
769+
or the original Index is returned.
770+
771+
.. versionadded:: 0.18.2
772+
773+
"""
774+
775+
@Appender(_index_shared_docs['astype'])
776+
def astype(self, dtype, copy=True):
777+
return Index(self.values.astype(dtype, copy=copy), name=self.name,
778+
dtype=dtype)
759779

760780
def _to_safe_for_reshape(self):
761781
""" convert to object if we are a categorical """

pandas/indexes/multi.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -2074,11 +2074,14 @@ def difference(self, other):
20742074
return MultiIndex.from_tuples(difference, sortorder=0,
20752075
names=result_names)
20762076

2077-
def astype(self, dtype):
2077+
@Appender(_index_shared_docs['astype'])
2078+
def astype(self, dtype, copy=True):
20782079
if not is_object_dtype(np.dtype(dtype)):
20792080
raise TypeError('Setting %s dtype to anything other than object '
20802081
'is not supported' % self.__class__)
2081-
return self._shallow_copy()
2082+
elif copy is True:
2083+
return self._shallow_copy()
2084+
return self
20822085

20832086
def _convert_can_do_setop(self, other):
20842087
result_names = self.names

pandas/indexes/numeric.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pandas.index as _index
55

66
from pandas import compat
7-
from pandas.indexes.base import Index, InvalidIndexError
7+
from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs
88
from pandas.util.decorators import Appender, cache_readonly
99
import pandas.core.common as com
1010
from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype,
@@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
238238
def inferred_type(self):
239239
return 'floating'
240240

241-
def astype(self, dtype):
241+
@Appender(_index_shared_docs['astype'])
242+
def astype(self, dtype, copy=True):
242243
dtype = pandas_dtype(dtype)
243-
if is_float_dtype(dtype) or is_integer_dtype(dtype):
244-
values = self._values.astype(dtype)
244+
if is_float_dtype(dtype):
245+
values = self._values.astype(dtype, copy=copy)
246+
elif is_integer_dtype(dtype):
247+
if self.hasnans:
248+
raise ValueError('cannot convert float NaN to integer')
249+
values = self._values.astype(dtype, copy=copy)
245250
elif is_object_dtype(dtype):
246-
values = self._values
251+
values = self._values.astype('object', copy=copy)
247252
else:
248253
raise TypeError('Setting %s dtype to anything other than '
249254
'float64 or object is not supported' %

pandas/tests/indexes/test_datetimelike.py

+195-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44

55
import numpy as np
66

7-
from pandas import (date_range, period_range,
8-
Series, Index, DatetimeIndex,
9-
TimedeltaIndex, PeriodIndex)
7+
from pandas import (DatetimeIndex, Float64Index, Index, Int64Index,
8+
NaT, Period, PeriodIndex, Series, Timedelta,
9+
TimedeltaIndex, date_range, period_range,
10+
timedelta_range)
1011

1112
import pandas.util.testing as tm
1213

@@ -337,6 +338,117 @@ def test_construction_dti_with_mixed_timezones(self):
337338
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
338339
tz='US/Eastern', name='idx')
339340

341+
def test_astype(self):
342+
# GH 13149, GH 13209
343+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
344+
345+
result = idx.astype(object)
346+
expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object)
347+
tm.assert_index_equal(result, expected)
348+
349+
result = idx.astype(int)
350+
expected = Int64Index([1463356800000000000] +
351+
[-9223372036854775808] * 3, dtype=np.int64)
352+
tm.assert_index_equal(result, expected)
353+
354+
rng = date_range('1/1/2000', periods=10)
355+
result = rng.astype('i8')
356+
self.assert_numpy_array_equal(result, rng.asi8)
357+
358+
def test_astype_with_tz(self):
359+
360+
# with tz
361+
rng = date_range('1/1/2000', periods=10, tz='US/Eastern')
362+
result = rng.astype('datetime64[ns]')
363+
expected = (date_range('1/1/2000', periods=10,
364+
tz='US/Eastern')
365+
.tz_convert('UTC').tz_localize(None))
366+
tm.assert_index_equal(result, expected)
367+
368+
# BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex
369+
result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str)
370+
expected = pd.Series(
371+
['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object)
372+
tm.assert_series_equal(result, expected)
373+
374+
result = Series(pd.date_range('2012-01-01', periods=3,
375+
tz='US/Eastern')).astype(str)
376+
expected = Series(['2012-01-01 00:00:00-05:00',
377+
'2012-01-02 00:00:00-05:00',
378+
'2012-01-03 00:00:00-05:00'],
379+
dtype=object)
380+
tm.assert_series_equal(result, expected)
381+
382+
def test_astype_str_compat(self):
383+
# GH 13149, GH 13209
384+
# verify that we are returning NaT as a string (and not unicode)
385+
386+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
387+
result = idx.astype(str)
388+
expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object)
389+
tm.assert_index_equal(result, expected)
390+
391+
def test_astype_str(self):
392+
# test astype string - #10442
393+
result = date_range('2012-01-01', periods=4,
394+
name='test_name').astype(str)
395+
expected = Index(['2012-01-01', '2012-01-02', '2012-01-03',
396+
'2012-01-04'], name='test_name', dtype=object)
397+
tm.assert_index_equal(result, expected)
398+
399+
# test astype string with tz and name
400+
result = date_range('2012-01-01', periods=3, name='test_name',
401+
tz='US/Eastern').astype(str)
402+
expected = Index(['2012-01-01 00:00:00-05:00',
403+
'2012-01-02 00:00:00-05:00',
404+
'2012-01-03 00:00:00-05:00'],
405+
name='test_name', dtype=object)
406+
tm.assert_index_equal(result, expected)
407+
408+
# test astype string with freqH and name
409+
result = date_range('1/1/2011', periods=3, freq='H',
410+
name='test_name').astype(str)
411+
expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
412+
'2011-01-01 02:00:00'],
413+
name='test_name', dtype=object)
414+
tm.assert_index_equal(result, expected)
415+
416+
# test astype string with freqH and timezone
417+
result = date_range('3/6/2012 00:00', periods=2, freq='H',
418+
tz='Europe/London', name='test_name').astype(str)
419+
expected = Index(['2012-03-06 00:00:00+00:00',
420+
'2012-03-06 01:00:00+00:00'],
421+
dtype=object, name='test_name')
422+
tm.assert_index_equal(result, expected)
423+
424+
def test_astype_datetime64(self):
425+
# GH 13149, GH 13209
426+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
427+
428+
result = idx.astype('datetime64[ns]')
429+
tm.assert_index_equal(result, idx)
430+
self.assertFalse(result is idx)
431+
432+
result = idx.astype('datetime64[ns]', copy=False)
433+
tm.assert_index_equal(result, idx)
434+
self.assertTrue(result is idx)
435+
436+
idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST')
437+
result = idx_tz.astype('datetime64[ns]')
438+
expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'],
439+
dtype='datetime64[ns]')
440+
tm.assert_index_equal(result, expected)
441+
442+
def test_astype_raises(self):
443+
# GH 13149, GH 13209
444+
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])
445+
446+
self.assertRaises(ValueError, idx.astype, float)
447+
self.assertRaises(ValueError, idx.astype, 'timedelta64')
448+
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
449+
self.assertRaises(ValueError, idx.astype, 'datetime64')
450+
self.assertRaises(ValueError, idx.astype, 'datetime64[D]')
451+
340452
def test_get_loc(self):
341453
idx = pd.date_range('2000-01-01', periods=3)
342454

@@ -585,6 +697,42 @@ def setUp(self):
585697
def create_index(self):
586698
return period_range('20130101', periods=5, freq='D')
587699

700+
def test_astype(self):
701+
# GH 13149, GH 13209
702+
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
703+
704+
result = idx.astype(object)
705+
expected = Index([Period('2016-05-16', freq='D')] +
706+
[Period(NaT, freq='D')] * 3, dtype='object')
707+
# Hack because of lack of support for Period null checking (GH12759)
708+
tm.assert_index_equal(result[:1], expected[:1])
709+
result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64)
710+
expected_arr = np.asarray([p.ordinal for p in expected],
711+
dtype=np.int64)
712+
tm.assert_numpy_array_equal(result_arr, expected_arr)
713+
# TODO: When GH12759 is resolved, change the above hack to:
714+
# tm.assert_index_equal(result, expected) # now, it raises.
715+
716+
result = idx.astype(int)
717+
expected = Int64Index([16937] + [-9223372036854775808] * 3,
718+
dtype=np.int64)
719+
tm.assert_index_equal(result, expected)
720+
721+
idx = period_range('1990', '2009', freq='A')
722+
result = idx.astype('i8')
723+
self.assert_numpy_array_equal(result, idx.values)
724+
725+
def test_astype_raises(self):
726+
# GH 13149, GH 13209
727+
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
728+
729+
self.assertRaises(ValueError, idx.astype, str)
730+
self.assertRaises(ValueError, idx.astype, float)
731+
self.assertRaises(ValueError, idx.astype, 'timedelta64')
732+
self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]')
733+
self.assertRaises(ValueError, idx.astype, 'datetime64')
734+
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
735+
588736
def test_shift(self):
589737

590738
# test shift for PeriodIndex
@@ -726,6 +874,50 @@ def test_shift(self):
726874
'10 days 01:00:03'], freq='D')
727875
self.assert_index_equal(result, expected)
728876

877+
def test_astype(self):
878+
# GH 13149, GH 13209
879+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
880+
881+
result = idx.astype(object)
882+
expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3,
883+
dtype=object)
884+
tm.assert_index_equal(result, expected)
885+
886+
result = idx.astype(int)
887+
expected = Int64Index([100000000000000] + [-9223372036854775808] * 3,
888+
dtype=np.int64)
889+
tm.assert_index_equal(result, expected)
890+
891+
rng = timedelta_range('1 days', periods=10)
892+
893+
result = rng.astype('i8')
894+
self.assert_numpy_array_equal(result, rng.asi8)
895+
896+
def test_astype_timedelta64(self):
897+
# GH 13149, GH 13209
898+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
899+
900+
result = idx.astype('timedelta64')
901+
expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64')
902+
tm.assert_index_equal(result, expected)
903+
904+
result = idx.astype('timedelta64[ns]')
905+
tm.assert_index_equal(result, idx)
906+
self.assertFalse(result is idx)
907+
908+
result = idx.astype('timedelta64[ns]', copy=False)
909+
tm.assert_index_equal(result, idx)
910+
self.assertTrue(result is idx)
911+
912+
def test_astype_raises(self):
913+
# GH 13149, GH 13209
914+
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])
915+
916+
self.assertRaises(ValueError, idx.astype, float)
917+
self.assertRaises(ValueError, idx.astype, str)
918+
self.assertRaises(ValueError, idx.astype, 'datetime64')
919+
self.assertRaises(ValueError, idx.astype, 'datetime64[ns]')
920+
729921
def test_get_loc(self):
730922
idx = pd.to_timedelta(['0 days', '1 days', '2 days'])
731923

pandas/tests/indexes/test_numeric.py

+5
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,11 @@ def test_astype(self):
259259
for dtype in ['M8[ns]', 'm8[ns]']:
260260
self.assertRaises(TypeError, lambda: i.astype(dtype))
261261

262+
# GH 13149
263+
for dtype in ['int16', 'int32', 'int64']:
264+
i = Float64Index([0, 1.1, np.NAN])
265+
self.assertRaises(ValueError, lambda: i.astype(dtype))
266+
262267
def test_equals(self):
263268

264269
i = Float64Index([1.0, 2.0])

0 commit comments

Comments
 (0)