Skip to content

Commit 37feec1

Browse files
committed
Disable bytes on all methods except encode/decode
1 parent e6ec6bc commit 37feec1

File tree

2 files changed

+78
-7
lines changed

2 files changed

+78
-7
lines changed

pandas/core/strings.py

+77-6
Original file line numberDiff line numberDiff line change
@@ -1759,14 +1759,23 @@ def wrapper(self):
17591759

17601760
def _pat_wrapper(f, flags=False, na=False, **kwargs):
17611761
def wrapper1(self, pat):
1762+
if self._inferred_type in ['bytes']:
1763+
raise AttributeError("Cannot use .str.{} with 'bytes' "
1764+
"values".format(f.__name__))
17621765
result = f(self._parent, pat)
17631766
return self._wrap_result(result)
17641767

17651768
def wrapper2(self, pat, flags=0, **kwargs):
1769+
if self._inferred_type in ['bytes']:
1770+
raise AttributeError("Cannot use .str.{} with 'bytes' "
1771+
"values".format(f.__name__))
17661772
result = f(self._parent, pat, flags=flags, **kwargs)
17671773
return self._wrap_result(result)
17681774

17691775
def wrapper3(self, pat, na=np.nan):
1776+
if self._inferred_type in ['bytes']:
1777+
raise AttributeError("Cannot use .str.{} with 'bytes' "
1778+
"values".format(f.__name__))
17701779
result = f(self._parent, pat, na=na)
17711780
return self._wrap_result(result)
17721781

@@ -1803,7 +1812,7 @@ class StringMethods(NoNewAttributesMixin):
18031812
"""
18041813

18051814
def __init__(self, data):
1806-
self._validate(data)
1815+
self._inferred_type = self._validate(data)
18071816
self._is_categorical = is_categorical_dtype(data)
18081817

18091818
# .values.categories works for both Series/Index
@@ -1818,18 +1827,18 @@ def _validate(data):
18181827
raise AttributeError('Can only use .str accessor with Index, '
18191828
'not MultiIndex')
18201829

1821-
# see src/inference.pyx which can contain string values
1830+
# see _libs/lib.pyx for list of inferred types
18221831
allowed_types = ['string', 'unicode', 'empty',
18231832
'mixed', 'mixed-integer']
18241833
if isinstance(data, ABCSeries):
1834+
# needed for str.decode
18251835
allowed_types = allowed_types + ['bytes']
18261836

1827-
data = data.dropna() # missing values mess up type inference
18281837
values = getattr(data, 'values', data) # Series / Index
18291838
values = getattr(values, 'categories', values) # categorical / normal
1830-
inferred_type = lib.infer_dtype(values)
1839+
# missing values mess up type inference -> skip
1840+
inferred_type = lib.infer_dtype(values, skipna=True)
18311841

1832-
# same for Series and Index (that is not MultiIndex)
18331842
if inferred_type not in allowed_types:
18341843
# it's neither a string series/index nor a categorical series/index
18351844
# with strings inside the categories.
@@ -1838,7 +1847,8 @@ def _validate(data):
18381847
# have a str dtype (GH 9343 / 13877)
18391848
raise AttributeError("Can only use .str accessor with string "
18401849
"values (i.e. inferred_type is 'string', "
1841-
"'unicode' or 'mixed')")
1850+
"'unicode', 'mixed' or 'empty')")
1851+
return inferred_type
18421852

18431853
def __getitem__(self, key):
18441854
if isinstance(key, slice):
@@ -2188,6 +2198,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21882198
"""
21892199
from pandas import Index, Series, concat
21902200

2201+
if self._inferred_type in ['mixed', 'mixed-integer', 'bytes']:
2202+
raise AttributeError("Can only use .str.cat with string values "
2203+
"(i.e. inferred_type is 'string', 'unicode' "
2204+
"'empty')")
2205+
21912206
if isinstance(others, compat.string_types):
21922207
raise ValueError("Did you mean to supply a `sep` keyword?")
21932208
if sep is None:
@@ -2396,13 +2411,17 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
23962411
'side': 'beginning',
23972412
'method': 'split'})
23982413
def split(self, pat=None, n=-1, expand=False):
2414+
if self._inferred_type in ['bytes']:
2415+
raise AttributeError("Cannot use .str.split with 'bytes' values")
23992416
result = str_split(self._parent, pat, n=n)
24002417
return self._wrap_result(result, expand=expand)
24012418

24022419
@Appender(_shared_docs['str_split'] % {
24032420
'side': 'end',
24042421
'method': 'rsplit'})
24052422
def rsplit(self, pat=None, n=-1, expand=False):
2423+
if self._inferred_type in ['bytes']:
2424+
raise AttributeError("Cannot use .str.rsplit with 'bytes' values")
24062425
result = str_rsplit(self._parent, pat, n=n)
24072426
return self._wrap_result(result, expand=expand)
24082427

@@ -2493,6 +2512,9 @@ def rsplit(self, pat=None, n=-1, expand=False):
24932512
'also': 'rpartition : Split the string at the last occurrence of `sep`'
24942513
})
24952514
def partition(self, pat=' ', expand=True):
2515+
if self._inferred_type in ['bytes']:
2516+
raise AttributeError("Cannot use .str.partition with "
2517+
"'bytes' values")
24962518
f = lambda x: x.partition(pat)
24972519
result = _na_map(f, self._parent)
24982520
return self._wrap_result(result, expand=expand)
@@ -2504,6 +2526,9 @@ def partition(self, pat=' ', expand=True):
25042526
'also': 'partition : Split the string at the first occurrence of `sep`'
25052527
})
25062528
def rpartition(self, pat=' ', expand=True):
2529+
if self._inferred_type in ['bytes']:
2530+
raise AttributeError("Cannot use .str.rpartition with "
2531+
"'bytes' values")
25072532
f = lambda x: x.rpartition(pat)
25082533
result = _na_map(f, self._parent)
25092534
return self._wrap_result(result, expand=expand)
@@ -2515,6 +2540,8 @@ def get(self, i):
25152540

25162541
@copy(str_join)
25172542
def join(self, sep):
2543+
if self._inferred_type in ['bytes']:
2544+
raise AttributeError("Cannot use .str.join with 'bytes' values")
25182545
result = str_join(self._parent, sep)
25192546
return self._wrap_result(result)
25202547

@@ -2565,14 +2592,20 @@ def pad(self, width, side='left', fillchar=' '):
25652592
@Appender(_shared_docs['str_pad'] % dict(side='left and right',
25662593
method='center'))
25672594
def center(self, width, fillchar=' '):
2595+
if self._inferred_type in ['bytes']:
2596+
raise AttributeError("Cannot use .str.center with 'bytes' values")
25682597
return self.pad(width, side='both', fillchar=fillchar)
25692598

25702599
@Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
25712600
def ljust(self, width, fillchar=' '):
2601+
if self._inferred_type in ['bytes']:
2602+
raise AttributeError("Cannot use .str.ljust with 'bytes' values")
25722603
return self.pad(width, side='right', fillchar=fillchar)
25732604

25742605
@Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
25752606
def rjust(self, width, fillchar=' '):
2607+
if self._inferred_type in ['bytes']:
2608+
raise AttributeError("Cannot use .str.rjust with 'bytes' values")
25762609
return self.pad(width, side='left', fillchar=fillchar)
25772610

25782611
def zfill(self, width):
@@ -2635,21 +2668,29 @@ def zfill(self, width):
26352668
4 NaN
26362669
dtype: object
26372670
"""
2671+
if self._inferred_type in ['bytes']:
2672+
raise AttributeError("Cannot use .str.zfill with 'bytes' values")
26382673
result = str_pad(self._parent, width, side='left', fillchar='0')
26392674
return self._wrap_result(result)
26402675

26412676
@copy(str_slice)
26422677
def slice(self, start=None, stop=None, step=None):
2678+
if self._inferred_type in ['bytes']:
2679+
raise AttributeError("Cannot use .str.slice with 'bytes' values")
26432680
result = str_slice(self._parent, start, stop, step)
26442681
return self._wrap_result(result)
26452682

26462683
@copy(str_slice_replace)
26472684
def slice_replace(self, start=None, stop=None, repl=None):
2685+
if self._inferred_type in ['bytes']:
2686+
raise AttributeError("Cannot use .str.slice_replace with "
2687+
"'bytes' values")
26482688
result = str_slice_replace(self._parent, start, stop, repl)
26492689
return self._wrap_result(result)
26502690

26512691
@copy(str_decode)
26522692
def decode(self, encoding, errors="strict"):
2693+
# need to allow bytes here
26532694
result = str_decode(self._parent, encoding, errors)
26542695
return self._wrap_result(result)
26552696

@@ -2724,28 +2765,39 @@ def encode(self, encoding, errors="strict"):
27242765
@Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
27252766
method='strip'))
27262767
def strip(self, to_strip=None):
2768+
if self._inferred_type in ['bytes']:
2769+
raise AttributeError("Cannot use .str.strip with 'bytes' values")
27272770
result = str_strip(self._parent, to_strip, side='both')
27282771
return self._wrap_result(result)
27292772

27302773
@Appender(_shared_docs['str_strip'] % dict(side='left side',
27312774
method='lstrip'))
27322775
def lstrip(self, to_strip=None):
2776+
if self._inferred_type in ['bytes']:
2777+
raise AttributeError("Cannot use .str.lstrip with 'bytes' values")
27332778
result = str_strip(self._parent, to_strip, side='left')
27342779
return self._wrap_result(result)
27352780

27362781
@Appender(_shared_docs['str_strip'] % dict(side='right side',
27372782
method='rstrip'))
27382783
def rstrip(self, to_strip=None):
2784+
if self._inferred_type in ['bytes']:
2785+
raise AttributeError("Cannot use .str.rstrip with 'bytes' values")
27392786
result = str_strip(self._parent, to_strip, side='right')
27402787
return self._wrap_result(result)
27412788

27422789
@copy(str_wrap)
27432790
def wrap(self, width, **kwargs):
2791+
if self._inferred_type in ['bytes']:
2792+
raise AttributeError("Cannot use .str.wrap with 'bytes' values")
27442793
result = str_wrap(self._parent, width, **kwargs)
27452794
return self._wrap_result(result)
27462795

27472796
@copy(str_get_dummies)
27482797
def get_dummies(self, sep='|'):
2798+
if self._inferred_type in ['bytes']:
2799+
raise AttributeError("Cannot use .str.get_dummies with "
2800+
"'bytes' values")
27492801
# we need to cast to Series of strings as only that has all
27502802
# methods available for making the dummies...
27512803
data = self._orig.astype(str) if self._is_categorical else self._parent
@@ -2755,6 +2807,9 @@ def get_dummies(self, sep='|'):
27552807

27562808
@copy(str_translate)
27572809
def translate(self, table, deletechars=None):
2810+
if self._inferred_type in ['bytes']:
2811+
raise AttributeError("Cannot use .str.translate with "
2812+
"'bytes' values")
27582813
result = str_translate(self._parent, table, deletechars)
27592814
return self._wrap_result(result)
27602815

@@ -2765,10 +2820,15 @@ def translate(self, table, deletechars=None):
27652820

27662821
@copy(str_extract)
27672822
def extract(self, pat, flags=0, expand=True):
2823+
if self._inferred_type in ['bytes']:
2824+
raise AttributeError("Cannot use .str.extract with 'bytes' values")
27682825
return str_extract(self, pat, flags=flags, expand=expand)
27692826

27702827
@copy(str_extractall)
27712828
def extractall(self, pat, flags=0):
2829+
if self._inferred_type in ['bytes']:
2830+
raise AttributeError("Cannot use .str.extractall with "
2831+
"'bytes' values")
27722832
return str_extractall(self._orig, pat, flags=flags)
27732833

27742834
_shared_docs['find'] = ("""
@@ -2798,13 +2858,17 @@ def extractall(self, pat, flags=0):
27982858
dict(side='lowest', method='find',
27992859
also='rfind : Return highest indexes in each strings'))
28002860
def find(self, sub, start=0, end=None):
2861+
if self._inferred_type in ['bytes']:
2862+
raise AttributeError("Cannot use .str.find with 'bytes' values")
28012863
result = str_find(self._parent, sub, start=start, end=end, side='left')
28022864
return self._wrap_result(result)
28032865

28042866
@Appender(_shared_docs['find'] %
28052867
dict(side='highest', method='rfind',
28062868
also='find : Return lowest indexes in each strings'))
28072869
def rfind(self, sub, start=0, end=None):
2870+
if self._inferred_type in ['bytes']:
2871+
raise AttributeError("Cannot use .str.rfind with 'bytes' values")
28082872
result = str_find(self._parent, sub,
28092873
start=start, end=end, side='right')
28102874
return self._wrap_result(result)
@@ -2824,6 +2888,9 @@ def normalize(self, form):
28242888
normalized : Series/Index of objects
28252889
"""
28262890
import unicodedata
2891+
if self._inferred_type in ['bytes']:
2892+
raise AttributeError("Cannot use .str.normalize with "
2893+
"'bytes' values")
28272894
f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
28282895
result = _na_map(f, self._parent)
28292896
return self._wrap_result(result)
@@ -2856,6 +2923,8 @@ def normalize(self, form):
28562923
dict(side='lowest', similar='find', method='index',
28572924
also='rindex : Return highest indexes in each strings'))
28582925
def index(self, sub, start=0, end=None):
2926+
if self._inferred_type in ['bytes']:
2927+
raise AttributeError("Cannot use .str.index with 'bytes' values")
28592928
result = str_index(self._parent, sub,
28602929
start=start, end=end, side='left')
28612930
return self._wrap_result(result)
@@ -2864,6 +2933,8 @@ def index(self, sub, start=0, end=None):
28642933
dict(side='highest', similar='rfind', method='rindex',
28652934
also='index : Return lowest indexes in each strings'))
28662935
def rindex(self, sub, start=0, end=None):
2936+
if self._inferred_type in ['bytes']:
2937+
raise AttributeError("Cannot use .str.rindex with 'bytes' values")
28672938
result = str_index(self._parent, sub,
28682939
start=start, end=end, side='right')
28692940
return self._wrap_result(result)

pandas/tests/test_strings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3130,7 +3130,7 @@ def test_method_on_bytes(self):
31303130
lhs = Series(np.array(list('abc'), 'S1').astype(object))
31313131
rhs = Series(np.array(list('def'), 'S1').astype(object))
31323132
if compat.PY3:
3133-
pytest.raises(TypeError, lhs.str.cat, rhs, sep=',')
3133+
pytest.raises(AttributeError, lhs.str.cat, rhs)
31343134
else:
31353135
result = lhs.str.cat(rhs)
31363136
expected = Series(np.array(

0 commit comments

Comments
 (0)