Closes pandas-dev#7758 - astype(unicode) returning unicode.

fulmicoton · jreback · commit a797b28c87d9 · 2014-07-16T07:56:25.000-04:00
Just calls ``unicode`` on all the values (via the new ``lib.astype_unicode`` helper). Seems to work on both Python 2 and Python 3. Added the bugfix for pandas-dev#7758 to the v0.15.0 changelog.
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -185,8 +185,7 @@ There are no experimental changes in 0.15.0
 Bug Fixes
 ~~~~~~~~~
 
-
-
+- Bug in ``Series.astype("unicode")`` not calling ``unicode`` on the values correctly (:issue:`7758`)
 
 
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2492,6 +2492,9 @@ def _astype_nansafe(arr, dtype, copy=True):
     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
         # work around NumPy brokenness, #1987
         return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
+    elif issubclass(dtype.type, compat.text_type):
+        # in Py3 that's str, in Py2 that's unicode
+        return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
     elif issubclass(dtype.type, compat.string_types):
         return lib.astype_str(arr.ravel()).reshape(arr.shape)
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -781,6 +781,16 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 
     return result
 
+cpdef ndarray[object] astype_unicode(ndarray arr):
+    cdef:
+        Py_ssize_t i, n = arr.size
+        ndarray[object] result = np.empty(n, dtype=object)
+    # Element-wise unicode(); result is a new 1-D object array of arr.size.
+    for i in range(n):
+        util.set_value_at(result, i, unicode(arr[i]))  # expects a flat arr
+
+    return result
+
 cpdef ndarray[object] astype_str(ndarray arr):
     cdef:
         Py_ssize_t i, n = arr.size
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 # pylint: disable-msg=E1101,W0612
 
 import sys
@@ -4851,13 +4852,41 @@ def test_astype_str(self):
         s1 = Series([digits * 10, tm.rands(63), tm.rands(64),
                     tm.rands(1000)])
         s2 = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])
-        types = (compat.text_type,) + (np.str_, np.unicode_)
+        types = (compat.text_type, np.str_)
         for typ in types:
             for s in (s1, s2):
                 res = s.astype(typ)
                 expec = s.map(compat.text_type)
                 assert_series_equal(res, expec)
 
+    def test_astype_unicode(self):
+        # GH 7758; a bit of magic is needed to set the default encoding to utf-8
+        digits = string.digits
+        test_series = [
+            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
+            Series([u"データーサイエンス、お前はもう死んでいる"]),
+        ]
+
+        former_encoding = None
+        if not compat.PY3:
+            # Python 2 only: force the default encoding to utf-8 for this test
+            former_encoding = sys.getdefaultencoding()
+            reload(sys)
+            sys.setdefaultencoding("utf-8")
+        try:
+            if sys.getdefaultencoding() == "utf-8":
+                encoded = u"野菜食べないとやばい".encode("utf-8")
+                test_series.append(Series([encoded]))
+            for s in test_series:
+                res = s.astype("unicode")
+                expec = s.map(compat.text_type)
+                assert_series_equal(res, expec)
+        finally:  # restore the former encoding even if an assertion failed
+            if former_encoding is not None and former_encoding != "utf-8":
+                reload(sys)
+                sys.setdefaultencoding(former_encoding)
+
+
     def test_map(self):
         index, data = tm.getMixedTypeDict()