From 68502cf2bf9fc0f17441593c49aadd34e6c6ebd3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 May 2020 09:56:40 -0700 Subject: [PATCH 1/4] PERF: use fastpath for is_categorical_dtype --- pandas/core/indexes/accessors.py | 2 +- pandas/core/indexes/base.py | 3 +++ pandas/core/indexes/multi.py | 2 +- pandas/core/internals/blocks.py | 5 ++++- pandas/io/pytables.py | 6 +++--- pandas/io/stata.py | 2 +- pandas/tests/base/test_misc.py | 4 ++-- pandas/tests/frame/test_alter_axes.py | 4 ++-- pandas/tests/indexing/test_categorical.py | 22 +++++++++++----------- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/series/test_constructors.py | 4 ++-- 12 files changed, 33 insertions(+), 27 deletions(-) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index db2264db438f4..598d228723ac8 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -434,7 +434,7 @@ def __new__(cls, data: "Series"): f"cannot convert an object of type {type(data)} to a datetimelike index" ) - orig = data if is_categorical_dtype(data) else None + orig = data if is_categorical_dtype(data.dtype) else None if orig is not None: data = data._constructor( orig.array, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 79af28dc5f2ce..cf17ce9db6b1a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -631,6 +631,9 @@ def astype(self, dtype, copy=True): Index Index with values cast to specified dtype. """ + if dtype is not None: + dtype = pandas_dtype(dtype) + if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 86110433b36b1..72369a13b150f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -651,7 +651,7 @@ def values(self): for i in range(self.nlevels): vals = self._get_level_values(i) - if is_categorical_dtype(vals): + if is_categorical_dtype(vals.dtype): vals = vals._internal_get_values() if isinstance(vals.dtype, ExtensionDtype) or isinstance( vals, (ABCDatetimeIndex, ABCTimedeltaIndex) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 39df7ae3c9b60..d028d840448ea 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -535,10 +535,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) raise TypeError(msg) + if dtype is not None: + dtype = pandas_dtype(dtype) + # may need to convert to categorical if is_categorical_dtype(dtype): - if is_categorical_dtype(self.values): + if is_categorical_dtype(self.values.dtype): # GH 10696/18593: update an existing categorical efficiently return self.make_block(self.values.astype(dtype, copy=copy)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5845202550326..80d02b06ffb23 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2213,7 +2213,7 @@ def take_data(self): return self.data @classmethod - def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col": + def _get_atom(cls, values: ArrayLike) -> "Col": """ Get an appropriately typed and shaped pytables.Col object for values. """ @@ -2887,7 +2887,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None) empty_array = value.size == 0 transposed = False - if is_categorical_dtype(value): + if is_categorical_dtype(value.dtype): raise NotImplementedError( "Cannot store a category dtype in a HDF5 dataset that uses format=" '"fixed". Use format="table".' @@ -3795,7 +3795,7 @@ def get_blk_items(mgr, blocks): tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None meta = metadata = ordered = None - if is_categorical_dtype(data_converted): + if is_categorical_dtype(data_converted.dtype): ordered = data_converted.ordered meta = "category" metadata = np.array(data_converted.categories, copy=False).ravel() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b9b43685415d1..f445f05c2ee05 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2132,7 +2132,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame: Check for categorical columns, retain categorical information for Stata file and convert categorical data to int """ - is_cat = [is_categorical_dtype(data[col]) for col in data] + is_cat = [is_categorical_dtype(data[col].dtype) for col in data] self._is_col_cat = is_cat self._value_labels: List[StataValueLabel] = [] if not any(is_cat): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 6bab60f05ce89..72417d3afd579 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -122,8 +122,8 @@ def test_memory_usage(index_or_series_obj): is_object = is_object_dtype(obj) or ( isinstance(obj, Series) and is_object_dtype(obj.index) ) - is_categorical = is_categorical_dtype(obj) or ( - isinstance(obj, Series) and is_categorical_dtype(obj.index) + is_categorical = is_categorical_dtype(obj.dtype) or ( + isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype) ) if len(obj) == 0: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index cd23cd6aa9c63..486855f5c37cd 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -234,9 +234,9 @@ def test_setitem(self): df["D"] = s.values df["E"] = np.array(s.values) - assert is_categorical_dtype(df["B"]) + assert is_categorical_dtype(df["B"].dtype) assert is_interval_dtype(df["B"].cat.categories) - assert is_categorical_dtype(df["D"]) + assert is_categorical_dtype(df["D"].dtype) assert is_interval_dtype(df["D"].cat.categories) assert is_object_dtype(df["C"]) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c9634c4c90809..98edb56260b01 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -156,7 +156,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.iloc[2:4, :] tm.assert_frame_equal(res_df, exp_df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) # row res_row = df.iloc[2, :] @@ -166,7 +166,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - assert is_categorical_dtype(res_col) + assert is_categorical_dtype(res_col.dtype) # single value res_val = df.iloc[2, 0] @@ -176,7 +176,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) # row res_row = df.loc["j", :] @@ -186,7 +186,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) - assert is_categorical_dtype(res_col) + assert is_categorical_dtype(res_col.dtype) # single value res_val = df.loc["j", "cats"] @@ -197,7 +197,7 @@ def test_slicing_and_getting_ops(self): # res_df = df.loc["j":"k",[0,1]] # doesn't work? res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) # row res_row = df.loc["j", :] @@ -207,7 +207,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) - assert is_categorical_dtype(res_col) + assert is_categorical_dtype(res_col.dtype) # single value res_val = df.loc["j", df.columns[0]] @@ -240,23 +240,23 @@ def test_slicing_and_getting_ops(self): res_df = df.iloc[slice(2, 4)] tm.assert_frame_equal(res_df, exp_df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) res_df = df.iloc[[2, 3]] tm.assert_frame_equal(res_df, exp_df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - assert is_categorical_dtype(res_col) + assert is_categorical_dtype(res_col.dtype) res_df = df.iloc[:, slice(0, 2)] tm.assert_frame_equal(res_df, df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) res_df = df.iloc[:, [0, 1]] tm.assert_frame_equal(res_df, df) - assert is_categorical_dtype(res_df["cats"]) + assert is_categorical_dtype(res_df["cats"].dtype) def test_slicing_doc_examples(self): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index eaa92fa53d799..6839e3ed0bbea 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1065,7 +1065,7 @@ def test_categorical_order(self, file): # Check identity of codes for col in expected: - if is_categorical_dtype(expected[col]): + if is_categorical_dtype(expected[col].dtype): tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes) tm.assert_index_equal( expected[col].cat.categories, parsed[col].cat.categories @@ -1095,7 +1095,7 @@ def test_categorical_ordering(self, file): parsed_unordered = read_stata(file, order_categoricals=False) for col in parsed: - if not is_categorical_dtype(parsed[col]): + if not is_categorical_dtype(parsed[col].dtype): continue assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a92e628960456..4408aa0bbce4a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1729,7 +1729,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) - assert is_categorical_dtype(left.X.values) + assert is_categorical_dtype(left.X.values.dtype) # assert not left.X.values.is_dtype_equal(right.X.values) merged = pd.merge(left, right, on="X", how=join_type) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 85f47d0f6f5a4..d78324d92a036 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -377,12 +377,12 @@ def test_constructor_categorical_dtype(self): result = pd.Series( ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True) ) - assert is_categorical_dtype(result) is True + assert is_categorical_dtype(result.dtype) is True tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) assert result.cat.ordered result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) - assert is_categorical_dtype(result) + assert is_categorical_dtype(result.dtype) tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) assert result.cat.ordered is False From 6517c7e4abfb97cf2c3cf229e27be874244e1414 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 May 2020 10:00:08 -0700 Subject: [PATCH 2/4] use fastpath --- pandas/core/base.py | 2 +- pandas/core/groupby/ops.py | 2 +- pandas/core/strings.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7ea2ff95ea0de..309b6e0ad5e1a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1123,7 +1123,7 @@ def _map_values(self, mapper, na_action=None): if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index - if is_categorical_dtype(self._values): + if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d67811988d0f8..71d7a07aadf7f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -458,7 +458,7 @@ def _cython_operation( # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values) or is_sparse(values): + if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 76b851d8ac923..72d778524a364 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2092,7 +2092,7 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data) + self._is_categorical = is_categorical_dtype(data.dtype) self._is_string = data.dtype.name == "string" # ._values.categories works for both Series/Index From d0ab3a0af85b0b7daff18a39c3d63bcf0be02046 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 May 2020 11:47:15 -0700 Subject: [PATCH 3/4] clean --- pandas/_testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 74c4c661b4b83..8fbdcb89dafca 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -718,7 +718,7 @@ def _get_ilevel_values(index, level): assert_interval_array_equal(left._values, right._values) if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): + if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): assert_categorical_equal(left._values, right._values, obj=f"{obj} category") @@ -1250,7 +1250,7 @@ def assert_series_equal( assert_attr_equal("name", left, right, obj=obj) if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): + if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): assert_categorical_equal( left._values, right._values, From f7fee01e89b66d8c3961b154be22520af47a8bce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 May 2020 13:10:08 -0700 Subject: [PATCH 4/4] mypy fixup --- pandas/io/pytables.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 80d02b06ffb23..82380d456cd6d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2218,14 +2218,15 @@ def _get_atom(cls, values: ArrayLike) -> "Col": Get an appropriately typed and shaped pytables.Col object for values. """ dtype = values.dtype - itemsize = dtype.itemsize + itemsize = dtype.itemsize # type: ignore shape = values.shape if values.ndim == 1: # EA, use block shape pretending it is 2D + # TODO(EA2D): not necessary with 2D EAs shape = (1, values.size) - if is_categorical_dtype(dtype): + if isinstance(values, Categorical): codes = values.codes atom = cls.get_atom_data(shape, kind=codes.dtype.name) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):