Disable bytes on all methods except encode/decode

h-vetinari · h-vetinari · commit 37feec1161b2 · 2018-11-02T16:32:41.000+01:00
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -1759,14 +1759,23 @@ def wrapper(self):
 
 def _pat_wrapper(f, flags=False, na=False, **kwargs):
     def wrapper1(self, pat):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.{} with 'bytes' "
+                                 "values".format(f.__name__))
         result = f(self._parent, pat)
         return self._wrap_result(result)
 
     def wrapper2(self, pat, flags=0, **kwargs):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.{} with 'bytes' "
+                                 "values".format(f.__name__))
         result = f(self._parent, pat, flags=flags, **kwargs)
         return self._wrap_result(result)
 
     def wrapper3(self, pat, na=np.nan):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.{} with 'bytes' "
+                                 "values".format(f.__name__))
         result = f(self._parent, pat, na=na)
         return self._wrap_result(result)
 
@@ -1803,7 +1812,7 @@ class StringMethods(NoNewAttributesMixin):
     """
 
     def __init__(self, data):
-        self._validate(data)
+        self._inferred_type = self._validate(data)
         self._is_categorical = is_categorical_dtype(data)
 
         # .values.categories works for both Series/Index
@@ -1818,18 +1827,18 @@ def _validate(data):
                 raise AttributeError('Can only use .str accessor with Index, '
                                      'not MultiIndex')
 
-        # see src/inference.pyx which can contain string values
+        # see _libs/lib.pyx for list of inferred types
         allowed_types = ['string', 'unicode', 'empty',
                          'mixed', 'mixed-integer']
         if isinstance(data, ABCSeries):
+            # bytes must remain allowed on Series so that .str.decode works
             allowed_types = allowed_types + ['bytes']
 
-        data = data.dropna()  # missing values mess up type inference
         values = getattr(data, 'values', data)  # Series / Index
         values = getattr(values, 'categories', values)  # categorical / normal
-        inferred_type = lib.infer_dtype(values)
+        # missing values mess up type inference -> skip
+        inferred_type = lib.infer_dtype(values, skipna=True)
 
-        # same for Series and Index (that is not MultiIndex)
         if inferred_type not in allowed_types:
             # it's neither a string series/index not a categorical series/index
             # with strings inside the categories.
@@ -1838,7 +1847,8 @@ def _validate(data):
             # have a str dtype (GH 9343 / 13877)
             raise AttributeError("Can only use .str accessor with string "
                                  "values (i.e. inferred_type is 'string', "
-                                 "'unicode' or 'mixed')")
+                                 "'unicode', 'mixed' or 'empty')")
+        return inferred_type
 
     def __getitem__(self, key):
         if isinstance(key, slice):
@@ -2188,6 +2198,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
         """
         from pandas import Index, Series, concat
 
+        if self._inferred_type in ['mixed', 'mixed-integer', 'bytes']:
+            raise AttributeError("Can only use .str.cat with string values "
+                                 "(i.e. inferred_type is 'string', 'unicode' "
+                                 "or 'empty')")
+
         if isinstance(others, compat.string_types):
             raise ValueError("Did you mean to supply a `sep` keyword?")
         if sep is None:
@@ -2396,13 +2411,17 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
         'side': 'beginning',
         'method': 'split'})
     def split(self, pat=None, n=-1, expand=False):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.split with 'bytes' values")
         result = str_split(self._parent, pat, n=n)
         return self._wrap_result(result, expand=expand)
 
     @Appender(_shared_docs['str_split'] % {
         'side': 'end',
         'method': 'rsplit'})
     def rsplit(self, pat=None, n=-1, expand=False):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rsplit with 'bytes' values")
         result = str_rsplit(self._parent, pat, n=n)
         return self._wrap_result(result, expand=expand)
 
@@ -2493,6 +2512,9 @@ def rsplit(self, pat=None, n=-1, expand=False):
         'also': 'rpartition : Split the string at the last occurrence of `sep`'
     })
     def partition(self, pat=' ', expand=True):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.partition with "
+                                 "'bytes' values")
         f = lambda x: x.partition(pat)
         result = _na_map(f, self._parent)
         return self._wrap_result(result, expand=expand)
@@ -2504,6 +2526,9 @@ def partition(self, pat=' ', expand=True):
         'also': 'partition : Split the string at the first occurrence of `sep`'
     })
     def rpartition(self, pat=' ', expand=True):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rpartition with "
+                                 "'bytes' values")
         f = lambda x: x.rpartition(pat)
         result = _na_map(f, self._parent)
         return self._wrap_result(result, expand=expand)
@@ -2515,6 +2540,8 @@ def get(self, i):
 
     @copy(str_join)
     def join(self, sep):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.join with 'bytes' values")
         result = str_join(self._parent, sep)
         return self._wrap_result(result)
 
@@ -2565,14 +2592,20 @@ def pad(self, width, side='left', fillchar=' '):
     @Appender(_shared_docs['str_pad'] % dict(side='left and right',
                                              method='center'))
     def center(self, width, fillchar=' '):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.center with 'bytes' values")
         return self.pad(width, side='both', fillchar=fillchar)
 
     @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
     def ljust(self, width, fillchar=' '):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.ljust with 'bytes' values")
         return self.pad(width, side='right', fillchar=fillchar)
 
     @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
     def rjust(self, width, fillchar=' '):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rjust with 'bytes' values")
         return self.pad(width, side='left', fillchar=fillchar)
 
     def zfill(self, width):
@@ -2635,21 +2668,29 @@ def zfill(self, width):
         4     NaN
         dtype: object
         """
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.zfill with 'bytes' values")
         result = str_pad(self._parent, width, side='left', fillchar='0')
         return self._wrap_result(result)
 
     @copy(str_slice)
     def slice(self, start=None, stop=None, step=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.slice with 'bytes' values")
         result = str_slice(self._parent, start, stop, step)
         return self._wrap_result(result)
 
     @copy(str_slice_replace)
     def slice_replace(self, start=None, stop=None, repl=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.slice_replace with "
+                                 "'bytes' values")
         result = str_slice_replace(self._parent, start, stop, repl)
         return self._wrap_result(result)
 
     @copy(str_decode)
     def decode(self, encoding, errors="strict"):
+        # decode operates on bytes by design, so no 'bytes' guard here
         result = str_decode(self._parent, encoding, errors)
         return self._wrap_result(result)
 
@@ -2724,28 +2765,39 @@ def encode(self, encoding, errors="strict"):
     @Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
                                                method='strip'))
     def strip(self, to_strip=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.strip with 'bytes' values")
         result = str_strip(self._parent, to_strip, side='both')
         return self._wrap_result(result)
 
     @Appender(_shared_docs['str_strip'] % dict(side='left side',
                                                method='lstrip'))
     def lstrip(self, to_strip=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.lstrip with 'bytes' values")
         result = str_strip(self._parent, to_strip, side='left')
         return self._wrap_result(result)
 
     @Appender(_shared_docs['str_strip'] % dict(side='right side',
                                                method='rstrip'))
     def rstrip(self, to_strip=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rstrip with 'bytes' values")
         result = str_strip(self._parent, to_strip, side='right')
         return self._wrap_result(result)
 
     @copy(str_wrap)
     def wrap(self, width, **kwargs):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.wrap with 'bytes' values")
         result = str_wrap(self._parent, width, **kwargs)
         return self._wrap_result(result)
 
     @copy(str_get_dummies)
     def get_dummies(self, sep='|'):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.get_dummies with "
+                                 "'bytes' values")
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
         data = self._orig.astype(str) if self._is_categorical else self._parent
@@ -2755,6 +2807,9 @@ def get_dummies(self, sep='|'):
 
     @copy(str_translate)
     def translate(self, table, deletechars=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.translate with "
+                                 "'bytes' values")
         result = str_translate(self._parent, table, deletechars)
         return self._wrap_result(result)
 
@@ -2765,10 +2820,15 @@ def translate(self, table, deletechars=None):
 
     @copy(str_extract)
     def extract(self, pat, flags=0, expand=True):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.extract with 'bytes' values")
         return str_extract(self, pat, flags=flags, expand=expand)
 
     @copy(str_extractall)
     def extractall(self, pat, flags=0):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.extractall with "
+                                 "'bytes' values")
         return str_extractall(self._orig, pat, flags=flags)
 
     _shared_docs['find'] = ("""
@@ -2798,13 +2858,17 @@ def extractall(self, pat, flags=0):
               dict(side='lowest', method='find',
                    also='rfind : Return highest indexes in each strings'))
     def find(self, sub, start=0, end=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.find with 'bytes' values")
         result = str_find(self._parent, sub, start=start, end=end, side='left')
         return self._wrap_result(result)
 
     @Appender(_shared_docs['find'] %
               dict(side='highest', method='rfind',
                    also='find : Return lowest indexes in each strings'))
     def rfind(self, sub, start=0, end=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rfind with 'bytes' values")
         result = str_find(self._parent, sub,
                           start=start, end=end, side='right')
         return self._wrap_result(result)
@@ -2824,6 +2888,9 @@ def normalize(self, form):
         normalized : Series/Index of objects
         """
         import unicodedata
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.normalize with "
+                                 "'bytes' values")
         f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
         result = _na_map(f, self._parent)
         return self._wrap_result(result)
@@ -2856,6 +2923,8 @@ def normalize(self, form):
               dict(side='lowest', similar='find', method='index',
                    also='rindex : Return highest indexes in each strings'))
     def index(self, sub, start=0, end=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.index with 'bytes' values")
         result = str_index(self._parent, sub,
                            start=start, end=end, side='left')
         return self._wrap_result(result)
@@ -2864,6 +2933,8 @@ def index(self, sub, start=0, end=None):
               dict(side='highest', similar='rfind', method='rindex',
                    also='index : Return lowest indexes in each strings'))
     def rindex(self, sub, start=0, end=None):
+        if self._inferred_type in ['bytes']:
+            raise AttributeError("Cannot use .str.rindex with 'bytes' values")
         result = str_index(self._parent, sub,
                            start=start, end=end, side='right')
         return self._wrap_result(result)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -3130,7 +3130,7 @@ def test_method_on_bytes(self):
         lhs = Series(np.array(list('abc'), 'S1').astype(object))
         rhs = Series(np.array(list('def'), 'S1').astype(object))
         if compat.PY3:
-            pytest.raises(TypeError, lhs.str.cat, rhs, sep=',')
+            pytest.raises(AttributeError, lhs.str.cat, rhs)
         else:
             result = lhs.str.cat(rhs)
             expected = Series(np.array(