Review (jreback)

h-vetinari · h-vetinari · commit 173514f54e05 · 2018-11-02T16:32:41.000+01:00
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -8,7 +8,6 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
-    is_object_dtype,
     is_string_like,
     is_list_like,
     is_scalar,
@@ -1815,8 +1814,6 @@ def __init__(self, data):
 
     @staticmethod
     def _validate(data):
-        from pandas.core.index import Index
-
         if isinstance(data, ABCMultiIndex):
                 raise AttributeError('Can only use .str accessor with Index, '
                                      'not MultiIndex')
@@ -1827,16 +1824,13 @@ def _validate(data):
         if isinstance(data, ABCSeries):
             allowed_types = allowed_types + ['bytes']
 
-        values = data if isinstance(data, Index) else data.values
-        if is_categorical_dtype(data.dtype):
-            inf_type = lib.infer_dtype(values.categories)
-        else:
-            inf_type = lib.infer_dtype(values)
-
-        all_na_obj = is_object_dtype(values.dtype) and data.isna().all()
+        data = data.dropna()  # missing values mess up type inference
+        values = getattr(data, 'values', data)  # Series / Index
+        values = getattr(values, 'categories', values)  # categorical / normal
+        inferred_type = lib.infer_dtype(values)
 
         # same for Series and Index (that is not MultiIndex)
-        if inf_type not in allowed_types and not all_na_obj:
+        if inferred_type not in allowed_types:
             # it's neither a string series/index nor a categorical series/index
             # with strings inside the categories.
             # this really should exclude all series/index with any non-string
@@ -2275,9 +2269,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
 
         if isinstance(self._orig, Index):
             # add dtype for case that result is all-NA
-            result = Index(result, dtype='object', name=self._orig.name)
+            result = Index(result, dtype=object, name=self._orig.name)
         else:  # Series
-            result = Series(result, index=data.index, name=self._orig.name)
+            result = Series(result, dtype=object, index=data.index,
+                            name=self._orig.name)
         return result
 
     _shared_docs['str_split'] = ("""