Skip to content

PERF: use fastpath for is_categorical_dtype calls #33945

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ def _get_ilevel_values(index, level):
assert_interval_array_equal(left._values, right._values)

if check_categorical:
if is_categorical_dtype(left) or is_categorical_dtype(right):
if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
assert_categorical_equal(left._values, right._values, obj=f"{obj} category")


Expand Down Expand Up @@ -1250,7 +1250,7 @@ def assert_series_equal(
assert_attr_equal("name", left, right, obj=obj)

if check_categorical:
if is_categorical_dtype(left) or is_categorical_dtype(right):
if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
assert_categorical_equal(
left._values,
right._values,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ def _map_values(self, mapper, na_action=None):
if isinstance(mapper, ABCSeries):
# Since values were input this means we came from either
# a dict or a series and mapper should be an index
if is_categorical_dtype(self._values):
if is_categorical_dtype(self.dtype):
# use the built in categorical series mapper which saves
# time by mapping the categories instead of all values
return self._values.map(mapper)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ def _cython_operation(

# categoricals are only 1d, so we
# are not setup for dim transforming
if is_categorical_dtype(values) or is_sparse(values):
if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
raise NotImplementedError(f"{values.dtype} dtype not supported")
elif is_datetime64_any_dtype(values):
if how in ["add", "prod", "cumsum", "cumprod"]:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ def __new__(cls, data: "Series"):
f"cannot convert an object of type {type(data)} to a datetimelike index"
)

orig = data if is_categorical_dtype(data) else None
orig = data if is_categorical_dtype(data.dtype) else None
if orig is not None:
data = data._constructor(
orig.array,
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,9 @@ def astype(self, dtype, copy=True):
Index
Index with values cast to specified dtype.
"""
if dtype is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible we will need to optimize this (pandas_dtype) at some point so that it can fast-path things.

dtype = pandas_dtype(dtype)

if is_dtype_equal(self.dtype, dtype):
return self.copy() if copy else self

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def values(self):

for i in range(self.nlevels):
vals = self._get_level_values(i)
if is_categorical_dtype(vals):
if is_categorical_dtype(vals.dtype):
vals = vals._internal_get_values()
if isinstance(vals.dtype, ExtensionDtype) or isinstance(
vals, (ABCDatetimeIndex, ABCTimedeltaIndex)
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,10 +535,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
)
raise TypeError(msg)

if dtype is not None:
dtype = pandas_dtype(dtype)

# may need to convert to categorical
if is_categorical_dtype(dtype):

if is_categorical_dtype(self.values):
if is_categorical_dtype(self.values.dtype):
# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2092,7 +2092,7 @@ class StringMethods(NoNewAttributesMixin):

def __init__(self, data):
self._inferred_dtype = self._validate(data)
self._is_categorical = is_categorical_dtype(data)
self._is_categorical = is_categorical_dtype(data.dtype)
self._is_string = data.dtype.name == "string"

# ._values.categories works for both Series/Index
Expand Down
11 changes: 6 additions & 5 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2213,19 +2213,20 @@ def take_data(self):
return self.data

@classmethod
def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col":
def _get_atom(cls, values: ArrayLike) -> "Col":
"""
Get an appropriately typed and shaped pytables.Col object for values.
"""
dtype = values.dtype
itemsize = dtype.itemsize
itemsize = dtype.itemsize # type: ignore

shape = values.shape
if values.ndim == 1:
# EA, use block shape pretending it is 2D
# TODO(EA2D): not necessary with 2D EAs
shape = (1, values.size)

if is_categorical_dtype(dtype):
if isinstance(values, Categorical):
codes = values.codes
atom = cls.get_atom_data(shape, kind=codes.dtype.name)
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
Expand Down Expand Up @@ -2887,7 +2888,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None)
empty_array = value.size == 0
transposed = False

if is_categorical_dtype(value):
if is_categorical_dtype(value.dtype):
raise NotImplementedError(
"Cannot store a category dtype in a HDF5 dataset that uses format="
'"fixed". Use format="table".'
Expand Down Expand Up @@ -3795,7 +3796,7 @@ def get_blk_items(mgr, blocks):
tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None

meta = metadata = ordered = None
if is_categorical_dtype(data_converted):
if is_categorical_dtype(data_converted.dtype):
ordered = data_converted.ordered
meta = "category"
metadata = np.array(data_converted.categories, copy=False).ravel()
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2132,7 +2132,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
Check for categorical columns, retain categorical information for
Stata file and convert categorical data to int
"""
is_cat = [is_categorical_dtype(data[col]) for col in data]
is_cat = [is_categorical_dtype(data[col].dtype) for col in data]
self._is_col_cat = is_cat
self._value_labels: List[StataValueLabel] = []
if not any(is_cat):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ def test_memory_usage(index_or_series_obj):
is_object = is_object_dtype(obj) or (
isinstance(obj, Series) and is_object_dtype(obj.index)
)
is_categorical = is_categorical_dtype(obj) or (
isinstance(obj, Series) and is_categorical_dtype(obj.index)
is_categorical = is_categorical_dtype(obj.dtype) or (
isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
)

if len(obj) == 0:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ def test_setitem(self):
df["D"] = s.values
df["E"] = np.array(s.values)

assert is_categorical_dtype(df["B"])
assert is_categorical_dtype(df["B"].dtype)
assert is_interval_dtype(df["B"].cat.categories)
assert is_categorical_dtype(df["D"])
assert is_categorical_dtype(df["D"].dtype)
assert is_interval_dtype(df["D"].cat.categories)

assert is_object_dtype(df["C"])
Expand Down
22 changes: 11 additions & 11 deletions pandas/tests/indexing/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def test_slicing_and_getting_ops(self):
# frame
res_df = df.iloc[2:4, :]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

# row
res_row = df.iloc[2, :]
Expand All @@ -166,7 +166,7 @@ def test_slicing_and_getting_ops(self):
# col
res_col = df.iloc[:, 0]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col)
assert is_categorical_dtype(res_col.dtype)

# single value
res_val = df.iloc[2, 0]
Expand All @@ -176,7 +176,7 @@ def test_slicing_and_getting_ops(self):
# frame
res_df = df.loc["j":"k", :]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

# row
res_row = df.loc["j", :]
Expand All @@ -186,7 +186,7 @@ def test_slicing_and_getting_ops(self):
# col
res_col = df.loc[:, "cats"]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col)
assert is_categorical_dtype(res_col.dtype)

# single value
res_val = df.loc["j", "cats"]
Expand All @@ -197,7 +197,7 @@ def test_slicing_and_getting_ops(self):
# res_df = df.loc["j":"k",[0,1]] # doesn't work?
res_df = df.loc["j":"k", :]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

# row
res_row = df.loc["j", :]
Expand All @@ -207,7 +207,7 @@ def test_slicing_and_getting_ops(self):
# col
res_col = df.loc[:, "cats"]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col)
assert is_categorical_dtype(res_col.dtype)

# single value
res_val = df.loc["j", df.columns[0]]
Expand Down Expand Up @@ -240,23 +240,23 @@ def test_slicing_and_getting_ops(self):

res_df = df.iloc[slice(2, 4)]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

res_df = df.iloc[[2, 3]]
tm.assert_frame_equal(res_df, exp_df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

res_col = df.iloc[:, 0]
tm.assert_series_equal(res_col, exp_col)
assert is_categorical_dtype(res_col)
assert is_categorical_dtype(res_col.dtype)

res_df = df.iloc[:, slice(0, 2)]
tm.assert_frame_equal(res_df, df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

res_df = df.iloc[:, [0, 1]]
tm.assert_frame_equal(res_df, df)
assert is_categorical_dtype(res_df["cats"])
assert is_categorical_dtype(res_df["cats"].dtype)

def test_slicing_doc_examples(self):

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ def test_categorical_order(self, file):

# Check identity of codes
for col in expected:
if is_categorical_dtype(expected[col]):
if is_categorical_dtype(expected[col].dtype):
tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes)
tm.assert_index_equal(
expected[col].cat.categories, parsed[col].cat.categories
Expand Down Expand Up @@ -1095,7 +1095,7 @@ def test_categorical_ordering(self, file):

parsed_unordered = read_stata(file, order_categoricals=False)
for col in parsed:
if not is_categorical_dtype(parsed[col]):
if not is_categorical_dtype(parsed[col].dtype):
continue
assert parsed[col].cat.ordered
assert not parsed_unordered[col].cat.ordered
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1729,7 +1729,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):

X = change(right.X.astype("object"))
right = right.assign(X=X)
assert is_categorical_dtype(left.X.values)
assert is_categorical_dtype(left.X.values.dtype)
# assert not left.X.values.is_dtype_equal(right.X.values)

merged = pd.merge(left, right, on="X", how=join_type)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,12 +377,12 @@ def test_constructor_categorical_dtype(self):
result = pd.Series(
["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)
)
assert is_categorical_dtype(result) is True
assert is_categorical_dtype(result.dtype) is True
tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"]))
assert result.cat.ordered

result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
assert is_categorical_dtype(result)
assert is_categorical_dtype(result.dtype)
tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
assert result.cat.ordered is False

Expand Down