Skip to content

Commit 8b7c22d

Browse files
committed
Merge pull request pandas-dev#10032 from cpcloud/infer-bytes
pd.lib.infer_dtype infers bytes in Python3
2 parents 4b3410f + 3c50809 commit 8b7c22d

File tree

5 files changed

+69
-25
lines changed

5 files changed

+69
-25
lines changed

doc/source/whatsnew/v0.16.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ Enhancements
6868

6969
- ``DataFrame`` and ``Series`` now have a ``_constructor_expanddim`` property as an overridable constructor for data of one higher dimensionality. This should be used only when it is really needed; see :ref:`here <ref-subclassing-pandas>`.
7070

71+
- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 for byte-string data, i.e. ``S``-dtype arrays and object arrays of ``bytes`` (:issue:`10032`).
72+
7173
.. _whatsnew_0161.enhancements.categoricalindex:
7274

7375
CategoricalIndex

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737

3838
PY3 = (sys.version_info[0] >= 3)
3939
PY3_2 = sys.version_info[:2] == (3, 2)
40+
PY2 = sys.version_info[0] == 2
41+
4042

4143
try:
4244
import __builtin__ as builtins

pandas/src/inference.pyx

+42-24
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
import sys
12
cimport util
23
from tslib import NaT
34
from datetime import datetime, timedelta
45
iNaT = util.get_nat()
56

7+
cdef bint PY2 = sys.version_info[0] == 2
8+
69
# core.common import for fast inference checks
710
def is_float(object obj):
811
return util.is_float_object(obj)
@@ -38,10 +41,10 @@ _TYPE_MAP = {
3841
'f' : 'floating',
3942
'complex128': 'complex',
4043
'c' : 'complex',
41-
'string': 'string',
42-
'S' : 'string',
43-
'unicode': 'unicode',
44-
'U' : 'unicode',
44+
'string': 'string' if PY2 else 'bytes',
45+
'S' : 'string' if PY2 else 'bytes',
46+
'unicode': 'unicode' if PY2 else 'string',
47+
'U' : 'unicode' if PY2 else 'string',
4548
'bool': 'boolean',
4649
'b' : 'boolean',
4750
'datetime64[ns]' : 'datetime64',
@@ -181,6 +184,10 @@ def infer_dtype(object _values):
181184
if is_unicode_array(values):
182185
return 'unicode'
183186

187+
elif PyBytes_Check(val):
188+
if is_bytes_array(values):
189+
return 'bytes'
190+
184191
elif is_timedelta(val):
185192
if is_timedelta_or_timedelta64_array(values):
186193
return 'timedelta'
@@ -196,11 +203,6 @@ def infer_dtype(object _values):
196203

197204
return 'mixed'
198205

199-
def infer_dtype_list(list values):
200-
cdef:
201-
Py_ssize_t i, n = len(values)
202-
pass
203-
204206

205207
def is_possible_datetimelike_array(object arr):
206208
# determine if we have a possible datetimelike (or null-like) array
@@ -253,7 +255,6 @@ def is_bool_array(ndarray values):
253255
cdef:
254256
Py_ssize_t i, n = len(values)
255257
ndarray[object] objbuf
256-
object obj
257258

258259
if issubclass(values.dtype.type, np.bool_):
259260
return True
@@ -277,7 +278,6 @@ def is_integer_array(ndarray values):
277278
cdef:
278279
Py_ssize_t i, n = len(values)
279280
ndarray[object] objbuf
280-
object obj
281281

282282
if issubclass(values.dtype.type, np.integer):
283283
return True
@@ -298,7 +298,6 @@ def is_integer_float_array(ndarray values):
298298
cdef:
299299
Py_ssize_t i, n = len(values)
300300
ndarray[object] objbuf
301-
object obj
302301

303302
if issubclass(values.dtype.type, np.integer):
304303
return True
@@ -321,7 +320,6 @@ def is_float_array(ndarray values):
321320
cdef:
322321
Py_ssize_t i, n = len(values)
323322
ndarray[object] objbuf
324-
object obj
325323

326324
if issubclass(values.dtype.type, np.floating):
327325
return True
@@ -342,9 +340,9 @@ def is_string_array(ndarray values):
342340
cdef:
343341
Py_ssize_t i, n = len(values)
344342
ndarray[object] objbuf
345-
object obj
346343

347-
if issubclass(values.dtype.type, (np.string_, np.unicode_)):
344+
if ((PY2 and issubclass(values.dtype.type, np.string_)) or
345+
not PY2 and issubclass(values.dtype.type, np.unicode_)):
348346
return True
349347
elif values.dtype == np.object_:
350348
objbuf = values
@@ -363,7 +361,6 @@ def is_unicode_array(ndarray values):
363361
cdef:
364362
Py_ssize_t i, n = len(values)
365363
ndarray[object] objbuf
366-
object obj
367364

368365
if issubclass(values.dtype.type, np.unicode_):
369366
return True
@@ -381,8 +378,29 @@ def is_unicode_array(ndarray values):
381378
return False
382379

383380

381+
def is_bytes_array(ndarray values):
382+
cdef:
383+
Py_ssize_t i, n = len(values)
384+
ndarray[object] objbuf
385+
386+
if issubclass(values.dtype.type, np.bytes_):
387+
return True
388+
elif values.dtype == np.object_:
389+
objbuf = values
390+
391+
if n == 0:
392+
return False
393+
394+
for i in range(n):
395+
if not PyBytes_Check(objbuf[i]):
396+
return False
397+
return True
398+
else:
399+
return False
400+
401+
384402
def is_datetime_array(ndarray[object] values):
385-
cdef int i, null_count = 0, n = len(values)
403+
cdef Py_ssize_t i, null_count = 0, n = len(values)
386404
cdef object v
387405
if n == 0:
388406
return False
@@ -399,7 +417,7 @@ def is_datetime_array(ndarray[object] values):
399417
return null_count != n
400418

401419
def is_datetime64_array(ndarray values):
402-
cdef int i, null_count = 0, n = len(values)
420+
cdef Py_ssize_t i, null_count = 0, n = len(values)
403421
cdef object v
404422
if n == 0:
405423
return False
@@ -416,7 +434,7 @@ def is_datetime64_array(ndarray values):
416434
return null_count != n
417435

418436
def is_timedelta_array(ndarray values):
419-
cdef int i, null_count = 0, n = len(values)
437+
cdef Py_ssize_t i, null_count = 0, n = len(values)
420438
cdef object v
421439
if n == 0:
422440
return False
@@ -431,7 +449,7 @@ def is_timedelta_array(ndarray values):
431449
return null_count != n
432450

433451
def is_timedelta64_array(ndarray values):
434-
cdef int i, null_count = 0, n = len(values)
452+
cdef Py_ssize_t i, null_count = 0, n = len(values)
435453
cdef object v
436454
if n == 0:
437455
return False
@@ -447,7 +465,7 @@ def is_timedelta64_array(ndarray values):
447465

448466
def is_timedelta_or_timedelta64_array(ndarray values):
449467
""" infer with timedeltas and/or nat/none """
450-
cdef int i, null_count = 0, n = len(values)
468+
cdef Py_ssize_t i, null_count = 0, n = len(values)
451469
cdef object v
452470
if n == 0:
453471
return False
@@ -462,7 +480,7 @@ def is_timedelta_or_timedelta64_array(ndarray values):
462480
return null_count != n
463481

464482
def is_date_array(ndarray[object] values):
465-
cdef int i, n = len(values)
483+
cdef Py_ssize_t i, n = len(values)
466484
if n == 0:
467485
return False
468486
for i in range(n):
@@ -471,7 +489,7 @@ def is_date_array(ndarray[object] values):
471489
return True
472490

473491
def is_time_array(ndarray[object] values):
474-
cdef int i, n = len(values)
492+
cdef Py_ssize_t i, n = len(values)
475493
if n == 0:
476494
return False
477495
for i in range(n):
@@ -484,7 +502,7 @@ def is_period(object o):
484502
return isinstance(o,Period)
485503

486504
def is_period_array(ndarray[object] values):
487-
cdef int i, n = len(values)
505+
cdef Py_ssize_t i, n = len(values)
488506
from pandas.tseries.period import Period
489507

490508
if n == 0:

pandas/tests/test_lib.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pandas as pd
77
from pandas.lib import isscalar, item_from_zerodim, max_len_string_array
88
import pandas.util.testing as tm
9-
from pandas.compat import u
9+
from pandas.compat import u, PY2
1010

1111

1212
class TestMisc(tm.TestCase):
@@ -28,6 +28,17 @@ def test_max_len_string_array(self):
2828
tm.assertRaises(TypeError,
2929
lambda: max_len_string_array(arr.astype('U')))
3030

31+
def test_infer_dtype_bytes(self):
32+
compare = 'string' if PY2 else 'bytes'
33+
34+
# string array of bytes
35+
arr = np.array(list('abc'), dtype='S1')
36+
self.assertEqual(pd.lib.infer_dtype(arr), compare)
37+
38+
# object array of bytes
39+
arr = arr.astype(object)
40+
self.assertEqual(pd.lib.infer_dtype(arr), compare)
41+
3142

3243
class TestIsscalar(tm.TestCase):
3344

pandas/tests/test_strings.py

+11
Original file line numberDiff line numberDiff line change
@@ -1802,6 +1802,17 @@ def test_index_str_accessor_visibility(self):
18021802
with self.assertRaisesRegexp(AttributeError, message):
18031803
idx.str
18041804

1805+
def test_method_on_bytes(self):
1806+
lhs = Series(np.array(list('abc'), 'S1').astype(object))
1807+
rhs = Series(np.array(list('def'), 'S1').astype(object))
1808+
if compat.PY3:
1809+
self.assertRaises(TypeError, lhs.str.cat, rhs)
1810+
else:
1811+
result = lhs.str.cat(rhs)
1812+
expected = Series(np.array(['ad', 'be', 'cf'],
1813+
'S2').astype(object))
1814+
tm.assert_series_equal(result, expected)
1815+
18051816

18061817
if __name__ == '__main__':
18071818
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)