Partially fixes GH8732

Vikram Bhandoh · Vikram Bhandoh · commit 03e09478364a · 2014-12-02T21:10:03.000Z
In most cases it looks like we need to iterate over the array
and coerce each element, so that the appropriate
exception can be raised or nulls can be dealt with.
So the original case of casting ints to strings has to
work the way it does, unless we change the underlying behaviour.
When astype(str) is called on ints, each element
is first cast to a string and then made into a numpy object.
If we relied on numpy, it wouldn't cast the elements to strings; it would just
return them as objects, which breaks existing behaviour.

It is possible to bypass iterating over the array when we are
coercing to int, assuming that there are no NaNs and the dtype
of the array is numeric.
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2585,14 +2585,15 @@ def _astype_nansafe(arr, dtype, copy=True):
         if np.isnan(arr).any():
             raise ValueError('Cannot convert NA to integer')
     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
+        # partially address #8732
+        iterate_over = isnull(arr).any() or not is_numeric_dtype(arr.dtype)
         # work around NumPy brokenness, #1987
-        return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
+        return lib.astype_intsafe(arr.ravel(), dtype, iterate_over).reshape(arr.shape)
     elif issubclass(dtype.type, compat.text_type):
         # in Py3 that's str, in Py2 that's unicode
         return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
     elif issubclass(dtype.type, compat.string_types):
         return lib.astype_str(arr.ravel()).reshape(arr.shape)
-
     if copy:
         return arr.astype(dtype)
     return arr.view(dtype)
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -827,7 +827,7 @@ def vec_binop(ndarray[object] left, ndarray[object] right, object op):
     return maybe_convert_bool(result)
 
 
-def astype_intsafe(ndarray[object] arr, new_dtype):
+def astype_intsafe(ndarray[object] arr, new_dtype, iterate_over):
     cdef:
         Py_ssize_t i, n = len(arr)
         object v
@@ -837,6 +837,9 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
     # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird
     is_datelike = new_dtype in ['M8[ns]','m8[ns]']
 
+    if not is_datelike and not iterate_over:
+        return arr.astype(new_dtype)
+
     result = np.empty(n, dtype=new_dtype)
     for i in range(n):
         v = arr[i]
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -560,11 +560,27 @@ def test_scalar_conversion(self):
     def test_astype(self):
         s = Series(np.random.randn(5),name='foo')
 
-        for dtype in ['float32','float64','int64','int32']:
+        for dtype in ['float32','float64','int64','int32', 'object']:
             astyped = s.astype(dtype)
             self.assertEqual(astyped.dtype, dtype)
             self.assertEqual(astyped.name, s.name)
 
+    def test_astype_to(self):
+        arr = np.random.randint(1, 10, size=100)
+        s = Series(arr)
+        for dtype in ['float32', 'float64', 'int64', 'int32', 'object']:
+            astyped = s.astype(dtype)
+            self.assertEqual(astyped.dtype, dtype)
+
+    def test_astype_int(self):
+        s = Series([1, 1.01, 1.02, 1.03])
+        astyped = s.astype(np.int64)
+        self.assertEqual(astyped.dtype, np.int64)
+        s = Series([1, 1.01, 1.02, 1.03, np.nan])
+        self.assertRaises(ValueError, s.astype, np.int64)
+        s = Series(['1', '1.01', 1.02, 1.03, np.nan])
+        self.assertRaises(ValueError, s.astype, np.int64)
+
     def test_constructor(self):
         # Recognize TimeSeries
         self.assertTrue(self.ts.is_time_series)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -894,6 +894,20 @@ def getMixedTypeDict():
 
     return index, data
 
+
+def makeMixedDataFrameWithNaN():
+    index = Index(['a', 'b', 'c', 'd', 'e', 'f'])
+
+    data = {
+        'A': [0., 1., 2., 3., 4., np.nan],
+        'B': [0., 1., 0., 1., 0., np.nan],
+        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5', np.nan],
+        'D': bdate_range('1/1/2009', periods=6)
+    }
+
+    return DataFrame(data, index=index)
+
+
 def makeMixedDataFrame():
     return DataFrame(getMixedTypeDict()[1])
 
diff --git a/vb_suite/astype.py b/vb_suite/astype.py
@@ -0,0 +1,24 @@
+from vbench.api import Benchmark
+
+common_setup = """from pandas_vb_common import *
+from datetime import timedelta
+import pandas as pd
+import numpy as np
+
+N = 1000000
+df = pd.DataFrame({'a': 1.,
+                   'b': 2,
+                   'c': 'foo',
+                   'float32' : np.array([1.]*N,dtype='float32'),
+                   'int32' : np.array([1]*N,dtype='int32'),
+                   },
+                   index=np.arange(N))
+
+mn = df._get_numeric_data()
+mn['little_float'] = np.array(12345.,dtype='float16')
+mn['big_float']    = np.array(123456789101112.,dtype='float64')
+"""
+
+astype_test = Benchmark('s.astype(np.int64)',
+                        common_setup,
+                        name='astype_test')
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
@@ -13,6 +13,7 @@
            'indexing',
            'io_bench',
            'io_sql',
+           'astype',
            'inference',
            'hdfstore_bench',
            'join_merge',