Skip to content

Commit a6890c6

Browse files
authored
PERF: use fastpath for is_categorical_dtype calls (#33945)
1 parent c5dad15 commit a6890c6

File tree

16 files changed, with 41 additions and 34 deletions.

pandas/_testing.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -718,7 +718,7 @@ def _get_ilevel_values(index, level):
718718
assert_interval_array_equal(left._values, right._values)
719719

720720
if check_categorical:
721-
if is_categorical_dtype(left) or is_categorical_dtype(right):
721+
if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
722722
assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
723723

724724

@@ -1250,7 +1250,7 @@ def assert_series_equal(
12501250
assert_attr_equal("name", left, right, obj=obj)
12511251

12521252
if check_categorical:
1253-
if is_categorical_dtype(left) or is_categorical_dtype(right):
1253+
if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
12541254
assert_categorical_equal(
12551255
left._values,
12561256
right._values,

pandas/core/base.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1123,7 +1123,7 @@ def _map_values(self, mapper, na_action=None):
11231123
if isinstance(mapper, ABCSeries):
11241124
# Since values were input this means we came from either
11251125
# a dict or a series and mapper should be an index
1126-
if is_categorical_dtype(self._values):
1126+
if is_categorical_dtype(self.dtype):
11271127
# use the built in categorical series mapper which saves
11281128
# time by mapping the categories instead of all values
11291129
return self._values.map(mapper)

pandas/core/groupby/ops.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -458,7 +458,7 @@ def _cython_operation(
458458

459459
# categoricals are only 1d, so we
460460
# are not setup for dim transforming
461-
if is_categorical_dtype(values) or is_sparse(values):
461+
if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
462462
raise NotImplementedError(f"{values.dtype} dtype not supported")
463463
elif is_datetime64_any_dtype(values):
464464
if how in ["add", "prod", "cumsum", "cumprod"]:

pandas/core/indexes/accessors.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -434,7 +434,7 @@ def __new__(cls, data: "Series"):
434434
f"cannot convert an object of type {type(data)} to a datetimelike index"
435435
)
436436

437-
orig = data if is_categorical_dtype(data) else None
437+
orig = data if is_categorical_dtype(data.dtype) else None
438438
if orig is not None:
439439
data = data._constructor(
440440
orig.array,

pandas/core/indexes/base.py

+3
Original file line number | Diff line number | Diff line change
@@ -631,6 +631,9 @@ def astype(self, dtype, copy=True):
631631
Index
632632
Index with values cast to specified dtype.
633633
"""
634+
if dtype is not None:
635+
dtype = pandas_dtype(dtype)
636+
634637
if is_dtype_equal(self.dtype, dtype):
635638
return self.copy() if copy else self
636639

pandas/core/indexes/multi.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -651,7 +651,7 @@ def values(self):
651651

652652
for i in range(self.nlevels):
653653
vals = self._get_level_values(i)
654-
if is_categorical_dtype(vals):
654+
if is_categorical_dtype(vals.dtype):
655655
vals = vals._internal_get_values()
656656
if isinstance(vals.dtype, ExtensionDtype) or isinstance(
657657
vals, (ABCDatetimeIndex, ABCTimedeltaIndex)

pandas/core/internals/blocks.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -535,10 +535,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
535535
)
536536
raise TypeError(msg)
537537

538+
if dtype is not None:
539+
dtype = pandas_dtype(dtype)
540+
538541
# may need to convert to categorical
539542
if is_categorical_dtype(dtype):
540543

541-
if is_categorical_dtype(self.values):
544+
if is_categorical_dtype(self.values.dtype):
542545
# GH 10696/18593: update an existing categorical efficiently
543546
return self.make_block(self.values.astype(dtype, copy=copy))
544547

pandas/core/strings.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2092,7 +2092,7 @@ class StringMethods(NoNewAttributesMixin):
20922092

20932093
def __init__(self, data):
20942094
self._inferred_dtype = self._validate(data)
2095-
self._is_categorical = is_categorical_dtype(data)
2095+
self._is_categorical = is_categorical_dtype(data.dtype)
20962096
self._is_string = data.dtype.name == "string"
20972097

20982098
# ._values.categories works for both Series/Index

pandas/io/pytables.py

+6 -5
Original file line number | Diff line number | Diff line change
@@ -2213,19 +2213,20 @@ def take_data(self):
22132213
return self.data
22142214

22152215
@classmethod
2216-
def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col":
2216+
def _get_atom(cls, values: ArrayLike) -> "Col":
22172217
"""
22182218
Get an appropriately typed and shaped pytables.Col object for values.
22192219
"""
22202220
dtype = values.dtype
2221-
itemsize = dtype.itemsize
2221+
itemsize = dtype.itemsize # type: ignore
22222222

22232223
shape = values.shape
22242224
if values.ndim == 1:
22252225
# EA, use block shape pretending it is 2D
2226+
# TODO(EA2D): not necessary with 2D EAs
22262227
shape = (1, values.size)
22272228

2228-
if is_categorical_dtype(dtype):
2229+
if isinstance(values, Categorical):
22292230
codes = values.codes
22302231
atom = cls.get_atom_data(shape, kind=codes.dtype.name)
22312232
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
@@ -2887,7 +2888,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None)
28872888
empty_array = value.size == 0
28882889
transposed = False
28892890

2890-
if is_categorical_dtype(value):
2891+
if is_categorical_dtype(value.dtype):
28912892
raise NotImplementedError(
28922893
"Cannot store a category dtype in a HDF5 dataset that uses format="
28932894
'"fixed". Use format="table".'
@@ -3795,7 +3796,7 @@ def get_blk_items(mgr, blocks):
37953796
tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None
37963797

37973798
meta = metadata = ordered = None
3798-
if is_categorical_dtype(data_converted):
3799+
if is_categorical_dtype(data_converted.dtype):
37993800
ordered = data_converted.ordered
38003801
meta = "category"
38013802
metadata = np.array(data_converted.categories, copy=False).ravel()

pandas/io/stata.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2132,7 +2132,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
21322132
Check for categorical columns, retain categorical information for
21332133
Stata file and convert categorical data to int
21342134
"""
2135-
is_cat = [is_categorical_dtype(data[col]) for col in data]
2135+
is_cat = [is_categorical_dtype(data[col].dtype) for col in data]
21362136
self._is_col_cat = is_cat
21372137
self._value_labels: List[StataValueLabel] = []
21382138
if not any(is_cat):

pandas/tests/base/test_misc.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,8 @@ def test_memory_usage(index_or_series_obj):
122122
is_object = is_object_dtype(obj) or (
123123
isinstance(obj, Series) and is_object_dtype(obj.index)
124124
)
125-
is_categorical = is_categorical_dtype(obj) or (
126-
isinstance(obj, Series) and is_categorical_dtype(obj.index)
125+
is_categorical = is_categorical_dtype(obj.dtype) or (
126+
isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
127127
)
128128

129129
if len(obj) == 0:

pandas/tests/frame/test_alter_axes.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -234,9 +234,9 @@ def test_setitem(self):
234234
df["D"] = s.values
235235
df["E"] = np.array(s.values)
236236

237-
assert is_categorical_dtype(df["B"])
237+
assert is_categorical_dtype(df["B"].dtype)
238238
assert is_interval_dtype(df["B"].cat.categories)
239-
assert is_categorical_dtype(df["D"])
239+
assert is_categorical_dtype(df["D"].dtype)
240240
assert is_interval_dtype(df["D"].cat.categories)
241241

242242
assert is_object_dtype(df["C"])

pandas/tests/indexing/test_categorical.py

+11 -11
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ def test_slicing_and_getting_ops(self):
156156
# frame
157157
res_df = df.iloc[2:4, :]
158158
tm.assert_frame_equal(res_df, exp_df)
159-
assert is_categorical_dtype(res_df["cats"])
159+
assert is_categorical_dtype(res_df["cats"].dtype)
160160

161161
# row
162162
res_row = df.iloc[2, :]
@@ -166,7 +166,7 @@ def test_slicing_and_getting_ops(self):
166166
# col
167167
res_col = df.iloc[:, 0]
168168
tm.assert_series_equal(res_col, exp_col)
169-
assert is_categorical_dtype(res_col)
169+
assert is_categorical_dtype(res_col.dtype)
170170

171171
# single value
172172
res_val = df.iloc[2, 0]
@@ -176,7 +176,7 @@ def test_slicing_and_getting_ops(self):
176176
# frame
177177
res_df = df.loc["j":"k", :]
178178
tm.assert_frame_equal(res_df, exp_df)
179-
assert is_categorical_dtype(res_df["cats"])
179+
assert is_categorical_dtype(res_df["cats"].dtype)
180180

181181
# row
182182
res_row = df.loc["j", :]
@@ -186,7 +186,7 @@ def test_slicing_and_getting_ops(self):
186186
# col
187187
res_col = df.loc[:, "cats"]
188188
tm.assert_series_equal(res_col, exp_col)
189-
assert is_categorical_dtype(res_col)
189+
assert is_categorical_dtype(res_col.dtype)
190190

191191
# single value
192192
res_val = df.loc["j", "cats"]
@@ -197,7 +197,7 @@ def test_slicing_and_getting_ops(self):
197197
# res_df = df.loc["j":"k",[0,1]] # doesn't work?
198198
res_df = df.loc["j":"k", :]
199199
tm.assert_frame_equal(res_df, exp_df)
200-
assert is_categorical_dtype(res_df["cats"])
200+
assert is_categorical_dtype(res_df["cats"].dtype)
201201

202202
# row
203203
res_row = df.loc["j", :]
@@ -207,7 +207,7 @@ def test_slicing_and_getting_ops(self):
207207
# col
208208
res_col = df.loc[:, "cats"]
209209
tm.assert_series_equal(res_col, exp_col)
210-
assert is_categorical_dtype(res_col)
210+
assert is_categorical_dtype(res_col.dtype)
211211

212212
# single value
213213
res_val = df.loc["j", df.columns[0]]
@@ -240,23 +240,23 @@ def test_slicing_and_getting_ops(self):
240240

241241
res_df = df.iloc[slice(2, 4)]
242242
tm.assert_frame_equal(res_df, exp_df)
243-
assert is_categorical_dtype(res_df["cats"])
243+
assert is_categorical_dtype(res_df["cats"].dtype)
244244

245245
res_df = df.iloc[[2, 3]]
246246
tm.assert_frame_equal(res_df, exp_df)
247-
assert is_categorical_dtype(res_df["cats"])
247+
assert is_categorical_dtype(res_df["cats"].dtype)
248248

249249
res_col = df.iloc[:, 0]
250250
tm.assert_series_equal(res_col, exp_col)
251-
assert is_categorical_dtype(res_col)
251+
assert is_categorical_dtype(res_col.dtype)
252252

253253
res_df = df.iloc[:, slice(0, 2)]
254254
tm.assert_frame_equal(res_df, df)
255-
assert is_categorical_dtype(res_df["cats"])
255+
assert is_categorical_dtype(res_df["cats"].dtype)
256256

257257
res_df = df.iloc[:, [0, 1]]
258258
tm.assert_frame_equal(res_df, df)
259-
assert is_categorical_dtype(res_df["cats"])
259+
assert is_categorical_dtype(res_df["cats"].dtype)
260260

261261
def test_slicing_doc_examples(self):
262262

pandas/tests/io/test_stata.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -1065,7 +1065,7 @@ def test_categorical_order(self, file):
10651065

10661066
# Check identity of codes
10671067
for col in expected:
1068-
if is_categorical_dtype(expected[col]):
1068+
if is_categorical_dtype(expected[col].dtype):
10691069
tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes)
10701070
tm.assert_index_equal(
10711071
expected[col].cat.categories, parsed[col].cat.categories
@@ -1095,7 +1095,7 @@ def test_categorical_ordering(self, file):
10951095

10961096
parsed_unordered = read_stata(file, order_categoricals=False)
10971097
for col in parsed:
1098-
if not is_categorical_dtype(parsed[col]):
1098+
if not is_categorical_dtype(parsed[col].dtype):
10991099
continue
11001100
assert parsed[col].cat.ordered
11011101
assert not parsed_unordered[col].cat.ordered

pandas/tests/reshape/merge/test_merge.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1729,7 +1729,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right):
17291729

17301730
X = change(right.X.astype("object"))
17311731
right = right.assign(X=X)
1732-
assert is_categorical_dtype(left.X.values)
1732+
assert is_categorical_dtype(left.X.values.dtype)
17331733
# assert not left.X.values.is_dtype_equal(right.X.values)
17341734

17351735
merged = pd.merge(left, right, on="X", how=join_type)

pandas/tests/series/test_constructors.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -377,12 +377,12 @@ def test_constructor_categorical_dtype(self):
377377
result = pd.Series(
378378
["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)
379379
)
380-
assert is_categorical_dtype(result) is True
380+
assert is_categorical_dtype(result.dtype) is True
381381
tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"]))
382382
assert result.cat.ordered
383383

384384
result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
385-
assert is_categorical_dtype(result)
385+
assert is_categorical_dtype(result.dtype)
386386
tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
387387
assert result.cat.ordered is False
388388

0 commit comments

Comments
 (0)