Skip to content

Commit a797b28

Browse files
fulmicoton authored and jreback committed
Closes pandas-dev#7758 - astype(unicode) returning unicode.
Just calls numpy.unicode on all the values. Seems to work alright on python2 and python3. Added bugfix of pandas-dev#7758 to v0.15.0 changelog.
1 parent 0a43c0c commit a797b28

File tree

4 files changed

+44
-3
lines changed

4 files changed

+44
-3
lines changed

doc/source/v0.15.0.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,7 @@ There are no experimental changes in 0.15.0
185185
Bug Fixes
186186
~~~~~~~~~
187187

188-
189-
188+
- Bug in ``Series.astype("unicode")`` not calling ``unicode`` on the values correctly (:issue:`7758`)
190189

191190

192191

pandas/core/common.py

+3
Original file line numberDiff line numberDiff line change
@@ -2492,6 +2492,9 @@ def _astype_nansafe(arr, dtype, copy=True):
24922492
elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
24932493
# work around NumPy brokenness, #1987
24942494
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
2495+
elif issubclass(dtype.type, compat.text_type):
2496+
# in Py3 that's str, in Py2 that's unicode
2497+
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
24952498
elif issubclass(dtype.type, compat.string_types):
24962499
return lib.astype_str(arr.ravel()).reshape(arr.shape)
24972500

pandas/lib.pyx

+10
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,16 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
781781

782782
return result
783783

784+
cpdef ndarray[object] astype_unicode(ndarray arr):
785+
cdef:
786+
Py_ssize_t i, n = arr.size
787+
ndarray[object] result = np.empty(n, dtype=object)
788+
789+
for i in range(n):
790+
util.set_value_at(result, i, unicode(arr[i]))
791+
792+
return result
793+
784794
cpdef ndarray[object] astype_str(ndarray arr):
785795
cdef:
786796
Py_ssize_t i, n = arr.size

pandas/tests/test_series.py

+30-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# coding=utf-8
12
# pylint: disable-msg=E1101,W0612
23

34
import sys
@@ -4851,13 +4852,41 @@ def test_astype_str(self):
48514852
s1 = Series([digits * 10, tm.rands(63), tm.rands(64),
48524853
tm.rands(1000)])
48534854
s2 = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])
4854-
types = (compat.text_type,) + (np.str_, np.unicode_)
4855+
types = (compat.text_type, np.str_)
48554856
for typ in types:
48564857
for s in (s1, s2):
48574858
res = s.astype(typ)
48584859
expec = s.map(compat.text_type)
48594860
assert_series_equal(res, expec)
48604861

4862+
def test_astype_unicode(self):
4863+
# a bit of magic is required to set the default encoding to utf-8
4864+
digits = string.digits
4865+
test_series = [
4866+
Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
4867+
Series([u"データーサイエンス、お前はもう死んでいる"]),
4868+
4869+
]
4870+
4871+
former_encoding = None
4872+
if not compat.PY3:
4873+
# in python 2 we can force the default encoding
4874+
# for this test
4875+
former_encoding = sys.getdefaultencoding()
4876+
reload(sys)
4877+
sys.setdefaultencoding("utf-8")
4878+
if sys.getdefaultencoding() == "utf-8":
4879+
test_series.append(Series([u"野菜食べないとやばい".encode("utf-8")]))
4880+
for s in test_series:
4881+
res = s.astype("unicode")
4882+
expec = s.map(compat.text_type)
4883+
assert_series_equal(res, expec)
4884+
# restore the former encoding
4885+
if former_encoding is not None and former_encoding != "utf-8":
4886+
reload(sys)
4887+
sys.setdefaultencoding(former_encoding)
4888+
4889+
48614890
def test_map(self):
48624891
index, data = tm.getMixedTypeDict()
48634892

0 commit comments

Comments
 (0)