Skip to content

Commit 03e0947

Browse files
author
Vikram Bhandoh
committed
Partially fixes GH8732
In most cases it looks like we need to iterate over the array and coerce each element, so that the appropriate exception can be raised or nulls can be dealt with. The original case of casting ints to strings therefore has to keep working the way it does, unless we change the underlying behaviour: when astype(str) is called on ints, each element is first cast to a string and then made into a numpy object. If we relied on numpy, it wouldn't cast the elements to strings; it would just return them as objects, which breaks existing behaviour. It is possible to bypass iterating over the array when we are coercing to int, assuming that there are no NaNs and that the dtype of the array is numeric.
1 parent 2063c1f commit 03e0947

File tree

6 files changed

+63
-4
lines changed

6 files changed

+63
-4
lines changed

pandas/core/common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2585,14 +2585,15 @@ def _astype_nansafe(arr, dtype, copy=True):
25852585
if np.isnan(arr).any():
25862586
raise ValueError('Cannot convert NA to integer')
25872587
elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
2588+
# partially address #8732
2589+
iterate_over = isnull(arr).any() or not is_numeric_dtype(arr.dtype)
25882590
# work around NumPy brokenness, #1987
2589-
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
2591+
return lib.astype_intsafe(arr.ravel(), dtype, iterate_over).reshape(arr.shape)
25902592
elif issubclass(dtype.type, compat.text_type):
25912593
# in Py3 that's str, in Py2 that's unicode
25922594
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
25932595
elif issubclass(dtype.type, compat.string_types):
25942596
return lib.astype_str(arr.ravel()).reshape(arr.shape)
2595-
25962597
if copy:
25972598
return arr.astype(dtype)
25982599
return arr.view(dtype)

pandas/lib.pyx

+4-1
Original file line numberDiff line numberDiff line change
@@ -827,7 +827,7 @@ def vec_binop(ndarray[object] left, ndarray[object] right, object op):
827827
return maybe_convert_bool(result)
828828

829829

830-
def astype_intsafe(ndarray[object] arr, new_dtype):
830+
def astype_intsafe(ndarray[object] arr, new_dtype, iterate_over):
831831
cdef:
832832
Py_ssize_t i, n = len(arr)
833833
object v
@@ -837,6 +837,9 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
837837
# on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird
838838
is_datelike = new_dtype in ['M8[ns]','m8[ns]']
839839

840+
if not is_datelike and not iterate_over:
841+
return arr.astype(new_dtype)
842+
840843
result = np.empty(n, dtype=new_dtype)
841844
for i in range(n):
842845
v = arr[i]

pandas/tests/test_series.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -560,11 +560,27 @@ def test_scalar_conversion(self):
560560
def test_astype(self):
561561
s = Series(np.random.randn(5),name='foo')
562562

563-
for dtype in ['float32','float64','int64','int32']:
563+
for dtype in ['float32','float64','int64','int32', 'object']:
564564
astyped = s.astype(dtype)
565565
self.assertEqual(astyped.dtype, dtype)
566566
self.assertEqual(astyped.name, s.name)
567567

568+
def test_astype_to(self):
569+
arr = np.random.randint(1, 10, size=100)
570+
s = Series(arr)
571+
for dtype in ['float32', 'float64', 'int64', 'int32', 'object']:
572+
astyped = s.astype(dtype)
573+
self.assertEqual(astyped.dtype, dtype)
574+
575+
def test_astype_int(self):
576+
s = Series([1, 1.01, 1.02, 1.03])
577+
astyped = s.astype(np.int64)
578+
self.assertEqual(astyped.dtype, np.int64)
579+
s = Series([1, 1.01, 1.02, 1.03, np.nan])
580+
self.assertRaises(ValueError, s.astype, np.int64)
581+
s = Series(['1', '1.01', 1.02, 1.03, np.nan])
582+
self.assertRaises(ValueError, s.astype, np.int64)
583+
568584
def test_constructor(self):
569585
# Recognize TimeSeries
570586
self.assertTrue(self.ts.is_time_series)

pandas/util/testing.py

+14
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,20 @@ def getMixedTypeDict():
894894

895895
return index, data
896896

897+
898+
def makeMixedDataFrameWithNaN():
899+
index = Index(['a', 'b', 'c', 'd', 'e', 'f'])
900+
901+
data = {
902+
'A': [0., 1., 2., 3., 4., np.nan],
903+
'B': [0., 1., 0., 1., 0., np.nan],
904+
'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5', np.nan],
905+
'D': bdate_range('1/1/2009', periods=6)
906+
}
907+
908+
return DataFrame(data, index=index)
909+
910+
897911
def makeMixedDataFrame():
898912
return DataFrame(getMixedTypeDict()[1])
899913

vb_suite/astype.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from vbench.api import Benchmark
2+
3+
common_setup = """from pandas_vb_common import *
4+
from datetime import timedelta
5+
import pandas as pd
6+
import numpy as np
7+
8+
N = 1000000
9+
df = pd.DataFrame({'a': 1.,
10+
'b': 2,
11+
'c': 'foo',
12+
'float32' : np.array([1.]*N,dtype='float32'),
13+
'int32' : np.array([1]*N,dtype='int32'),
14+
},
15+
index=np.arange(N))
16+
17+
mn = df._get_numeric_data()
18+
mn['little_float'] = np.array(12345.,dtype='float16')
19+
mn['big_float'] = np.array(123456789101112.,dtype='float64')
20+
"""
21+
22+
astype_test = Benchmark('s.astype(np.int64)',
23+
common_setup,
24+
name='astype_test')

vb_suite/suite.py

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
'indexing',
1414
'io_bench',
1515
'io_sql',
16+
'astype',
1617
'inference',
1718
'hdfstore_bench',
1819
'join_merge',

0 commit comments

Comments
 (0)