From 68502cf2bf9fc0f17441593c49aadd34e6c6ebd3 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Sat, 2 May 2020 09:56:40 -0700
Subject: [PATCH 1/4] PERF: use fastpath for is_categorical_dtype

---
 pandas/core/indexes/accessors.py          |  2 +-
 pandas/core/indexes/base.py               |  3 +++
 pandas/core/indexes/multi.py              |  2 +-
 pandas/core/internals/blocks.py           |  5 ++++-
 pandas/io/pytables.py                     |  6 +++---
 pandas/io/stata.py                        |  2 +-
 pandas/tests/base/test_misc.py            |  4 ++--
 pandas/tests/frame/test_alter_axes.py     |  4 ++--
 pandas/tests/indexing/test_categorical.py | 22 +++++++++++-----------
 pandas/tests/io/test_stata.py             |  4 ++--
 pandas/tests/reshape/merge/test_merge.py  |  2 +-
 pandas/tests/series/test_constructors.py  |  4 ++--
 12 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
index db2264db438f4..598d228723ac8 100644
--- a/pandas/core/indexes/accessors.py
+++ b/pandas/core/indexes/accessors.py
@@ -434,7 +434,7 @@ def __new__(cls, data: "Series"):
                 f"cannot convert an object of type {type(data)} to a datetimelike index"
             )
 
-        orig = data if is_categorical_dtype(data) else None
+        orig = data if is_categorical_dtype(data.dtype) else None
         if orig is not None:
             data = data._constructor(
                 orig.array,
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 79af28dc5f2ce..cf17ce9db6b1a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -631,6 +631,9 @@ def astype(self, dtype, copy=True):
         Index
             Index with values cast to specified dtype.
         """
+        if dtype is not None:
+            dtype = pandas_dtype(dtype)
+
         if is_dtype_equal(self.dtype, dtype):
             return self.copy() if copy else self
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 86110433b36b1..72369a13b150f 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -651,7 +651,7 @@ def values(self):
 
         for i in range(self.nlevels):
             vals = self._get_level_values(i)
-            if is_categorical_dtype(vals):
+            if is_categorical_dtype(vals.dtype):
                 vals = vals._internal_get_values()
             if isinstance(vals.dtype, ExtensionDtype) or isinstance(
                 vals, (ABCDatetimeIndex, ABCTimedeltaIndex)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 39df7ae3c9b60..d028d840448ea 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -535,10 +535,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
             )
             raise TypeError(msg)
 
+        if dtype is not None:
+            dtype = pandas_dtype(dtype)
+
         # may need to convert to categorical
         if is_categorical_dtype(dtype):
 
-            if is_categorical_dtype(self.values):
+            if is_categorical_dtype(self.values.dtype):
                 # GH 10696/18593: update an existing categorical efficiently
                 return self.make_block(self.values.astype(dtype, copy=copy))
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 5845202550326..80d02b06ffb23 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2213,7 +2213,7 @@ def take_data(self):
         return self.data
 
     @classmethod
-    def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col":
+    def _get_atom(cls, values: ArrayLike) -> "Col":
         """
         Get an appropriately typed and shaped pytables.Col object for values.
         """
@@ -2887,7 +2887,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None)
         empty_array = value.size == 0
         transposed = False
 
-        if is_categorical_dtype(value):
+        if is_categorical_dtype(value.dtype):
             raise NotImplementedError(
                 "Cannot store a category dtype in a HDF5 dataset that uses format="
                 '"fixed". Use format="table".'
@@ -3795,7 +3795,7 @@ def get_blk_items(mgr, blocks):
             tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None
 
             meta = metadata = ordered = None
-            if is_categorical_dtype(data_converted):
+            if is_categorical_dtype(data_converted.dtype):
                 ordered = data_converted.ordered
                 meta = "category"
                 metadata = np.array(data_converted.categories, copy=False).ravel()
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index b9b43685415d1..f445f05c2ee05 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -2132,7 +2132,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
         Check for categorical columns, retain categorical information for
         Stata file and convert categorical data to int
         """
-        is_cat = [is_categorical_dtype(data[col]) for col in data]
+        is_cat = [is_categorical_dtype(data[col].dtype) for col in data]
         self._is_col_cat = is_cat
         self._value_labels: List[StataValueLabel] = []
         if not any(is_cat):
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 6bab60f05ce89..72417d3afd579 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -122,8 +122,8 @@ def test_memory_usage(index_or_series_obj):
     is_object = is_object_dtype(obj) or (
         isinstance(obj, Series) and is_object_dtype(obj.index)
     )
-    is_categorical = is_categorical_dtype(obj) or (
-        isinstance(obj, Series) and is_categorical_dtype(obj.index)
+    is_categorical = is_categorical_dtype(obj.dtype) or (
+        isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
     )
 
     if len(obj) == 0:
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index cd23cd6aa9c63..486855f5c37cd 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -234,9 +234,9 @@ def test_setitem(self):
         df["D"] = s.values
         df["E"] = np.array(s.values)
 
-        assert is_categorical_dtype(df["B"])
+        assert is_categorical_dtype(df["B"].dtype)
         assert is_interval_dtype(df["B"].cat.categories)
-        assert is_categorical_dtype(df["D"])
+        assert is_categorical_dtype(df["D"].dtype)
         assert is_interval_dtype(df["D"].cat.categories)
 
         assert is_object_dtype(df["C"])
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index c9634c4c90809..98edb56260b01 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -156,7 +156,7 @@ def test_slicing_and_getting_ops(self):
         # frame
         res_df = df.iloc[2:4, :]
         tm.assert_frame_equal(res_df, exp_df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         # row
         res_row = df.iloc[2, :]
@@ -166,7 +166,7 @@ def test_slicing_and_getting_ops(self):
         # col
         res_col = df.iloc[:, 0]
         tm.assert_series_equal(res_col, exp_col)
-        assert is_categorical_dtype(res_col)
+        assert is_categorical_dtype(res_col.dtype)
 
         # single value
         res_val = df.iloc[2, 0]
@@ -176,7 +176,7 @@ def test_slicing_and_getting_ops(self):
         # frame
         res_df = df.loc["j":"k", :]
         tm.assert_frame_equal(res_df, exp_df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         # row
         res_row = df.loc["j", :]
@@ -186,7 +186,7 @@ def test_slicing_and_getting_ops(self):
         # col
         res_col = df.loc[:, "cats"]
         tm.assert_series_equal(res_col, exp_col)
-        assert is_categorical_dtype(res_col)
+        assert is_categorical_dtype(res_col.dtype)
 
         # single value
         res_val = df.loc["j", "cats"]
@@ -197,7 +197,7 @@ def test_slicing_and_getting_ops(self):
         # res_df = df.loc["j":"k",[0,1]] # doesn't work?
         res_df = df.loc["j":"k", :]
         tm.assert_frame_equal(res_df, exp_df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         # row
         res_row = df.loc["j", :]
@@ -207,7 +207,7 @@ def test_slicing_and_getting_ops(self):
         # col
         res_col = df.loc[:, "cats"]
         tm.assert_series_equal(res_col, exp_col)
-        assert is_categorical_dtype(res_col)
+        assert is_categorical_dtype(res_col.dtype)
 
         # single value
         res_val = df.loc["j", df.columns[0]]
@@ -240,23 +240,23 @@ def test_slicing_and_getting_ops(self):
 
         res_df = df.iloc[slice(2, 4)]
         tm.assert_frame_equal(res_df, exp_df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         res_df = df.iloc[[2, 3]]
         tm.assert_frame_equal(res_df, exp_df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         res_col = df.iloc[:, 0]
         tm.assert_series_equal(res_col, exp_col)
-        assert is_categorical_dtype(res_col)
+        assert is_categorical_dtype(res_col.dtype)
 
         res_df = df.iloc[:, slice(0, 2)]
         tm.assert_frame_equal(res_df, df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
         res_df = df.iloc[:, [0, 1]]
         tm.assert_frame_equal(res_df, df)
-        assert is_categorical_dtype(res_df["cats"])
+        assert is_categorical_dtype(res_df["cats"].dtype)
 
     def test_slicing_doc_examples(self):
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index eaa92fa53d799..6839e3ed0bbea 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1065,7 +1065,7 @@ def test_categorical_order(self, file):
 
         # Check identity of codes
         for col in expected:
-            if is_categorical_dtype(expected[col]):
+            if is_categorical_dtype(expected[col].dtype):
                 tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes)
                 tm.assert_index_equal(
                     expected[col].cat.categories, parsed[col].cat.categories
@@ -1095,7 +1095,7 @@ def test_categorical_ordering(self, file):
 
         parsed_unordered = read_stata(file, order_categoricals=False)
         for col in parsed:
-            if not is_categorical_dtype(parsed[col]):
+            if not is_categorical_dtype(parsed[col].dtype):
                 continue
             assert parsed[col].cat.ordered
             assert not parsed_unordered[col].cat.ordered
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index a92e628960456..4408aa0bbce4a 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1729,7 +1729,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
 
         X = change(right.X.astype("object"))
         right = right.assign(X=X)
-        assert is_categorical_dtype(left.X.values)
+        assert is_categorical_dtype(left.X.values.dtype)
         # assert not left.X.values.is_dtype_equal(right.X.values)
 
         merged = pd.merge(left, right, on="X", how=join_type)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 85f47d0f6f5a4..d78324d92a036 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -377,12 +377,12 @@ def test_constructor_categorical_dtype(self):
         result = pd.Series(
             ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)
         )
-        assert is_categorical_dtype(result) is True
+        assert is_categorical_dtype(result.dtype) is True
         tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"]))
         assert result.cat.ordered
 
         result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
-        assert is_categorical_dtype(result)
+        assert is_categorical_dtype(result.dtype)
         tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
         assert result.cat.ordered is False
 

From 6517c7e4abfb97cf2c3cf229e27be874244e1414 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Sat, 2 May 2020 10:00:08 -0700
Subject: [PATCH 2/4] use is_categorical_dtype fastpath in core/base, groupby/ops, strings

---
 pandas/core/base.py        | 2 +-
 pandas/core/groupby/ops.py | 2 +-
 pandas/core/strings.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index 7ea2ff95ea0de..309b6e0ad5e1a 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1123,7 +1123,7 @@ def _map_values(self, mapper, na_action=None):
         if isinstance(mapper, ABCSeries):
             # Since values were input this means we came from either
             # a dict or a series and mapper should be an index
-            if is_categorical_dtype(self._values):
+            if is_categorical_dtype(self.dtype):
                 # use the built in categorical series mapper which saves
                 # time by mapping the categories instead of all values
                 return self._values.map(mapper)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d67811988d0f8..71d7a07aadf7f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -458,7 +458,7 @@ def _cython_operation(
 
         # categoricals are only 1d, so we
         # are not setup for dim transforming
-        if is_categorical_dtype(values) or is_sparse(values):
+        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
             raise NotImplementedError(f"{values.dtype} dtype not supported")
         elif is_datetime64_any_dtype(values):
             if how in ["add", "prod", "cumsum", "cumprod"]:
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 76b851d8ac923..72d778524a364 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2092,7 +2092,7 @@ class StringMethods(NoNewAttributesMixin):
 
     def __init__(self, data):
         self._inferred_dtype = self._validate(data)
-        self._is_categorical = is_categorical_dtype(data)
+        self._is_categorical = is_categorical_dtype(data.dtype)
         self._is_string = data.dtype.name == "string"
 
         # ._values.categories works for both Series/Index

From d0ab3a0af85b0b7daff18a39c3d63bcf0be02046 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Sat, 2 May 2020 11:47:15 -0700
Subject: [PATCH 3/4] clean up _testing: pass dtype to is_categorical_dtype

---
 pandas/_testing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_testing.py b/pandas/_testing.py
index 74c4c661b4b83..8fbdcb89dafca 100644
--- a/pandas/_testing.py
+++ b/pandas/_testing.py
@@ -718,7 +718,7 @@ def _get_ilevel_values(index, level):
         assert_interval_array_equal(left._values, right._values)
 
     if check_categorical:
-        if is_categorical_dtype(left) or is_categorical_dtype(right):
+        if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
             assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
 
 
@@ -1250,7 +1250,7 @@ def assert_series_equal(
         assert_attr_equal("name", left, right, obj=obj)
 
     if check_categorical:
-        if is_categorical_dtype(left) or is_categorical_dtype(right):
+        if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
             assert_categorical_equal(
                 left._values,
                 right._values,

From f7fee01e89b66d8c3961b154be22520af47a8bce Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Sat, 2 May 2020 13:10:08 -0700
Subject: [PATCH 4/4] mypy fixup for _get_atom in io/pytables

---
 pandas/io/pytables.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 80d02b06ffb23..82380d456cd6d 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2218,14 +2218,15 @@ def _get_atom(cls, values: ArrayLike) -> "Col":
         Get an appropriately typed and shaped pytables.Col object for values.
         """
         dtype = values.dtype
-        itemsize = dtype.itemsize
+        itemsize = dtype.itemsize  # type: ignore
 
         shape = values.shape
         if values.ndim == 1:
             # EA, use block shape pretending it is 2D
+            # TODO(EA2D): not necessary with 2D EAs
             shape = (1, values.size)
 
-        if is_categorical_dtype(dtype):
+        if isinstance(values, Categorical):
             codes = values.codes
             atom = cls.get_atom_data(shape, kind=codes.dtype.name)
         elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):