PERF: dtype checks #52213

Merged (1 commit) on Mar 27, 2023
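
The pattern is the same across all three files: the `is_*_dtype` helpers from `pandas.core.dtypes.common` accept arrays, dtypes, strings, and scalars, so they normalize their argument on every call, which adds overhead on parsing and algorithm hot paths. The diff replaces them with direct `isinstance` checks against concrete dtype classes or comparisons on the NumPy `.kind` code. A rough micro-benchmark sketch of the idea (illustrative only; the helpers used here are public pandas API, but absolute timings depend on machine and pandas version):

# Illustrative timing sketch: helper call vs. direct .kind check on a dtype object.
import timeit

import numpy as np

from pandas.api.types import is_float_dtype

dtype = np.dtype("float64")

# The helper accepts arrays, dtypes, strings, ..., so it normalizes its argument first.
t_helper = timeit.timeit(lambda: is_float_dtype(dtype), number=100_000)

# The direct form is a single attribute lookup plus a string comparison.
t_direct = timeit.timeit(lambda: dtype.kind == "f", number=100_000)

print(f"is_float_dtype: {t_helper:.3f}s  dtype.kind == 'f': {t_direct:.3f}s")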
34 changes: 14 additions & 20 deletions pandas/_libs/parsers.pyx
@@ -10,7 +10,6 @@ import sys
 import time
 import warnings

-from pandas.errors import ParserError
 from pandas.util._exceptions import find_stack_level

 from pandas import StringDtype
@@ -106,15 +105,10 @@ from pandas.errors import (
     ParserWarning,
 )

-from pandas.core.dtypes.common import (
-    is_bool_dtype,
-    is_datetime64_dtype,
-    is_extension_array_dtype,
-    is_float_dtype,
-    is_integer_dtype,
-    is_object_dtype,
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    ExtensionDtype,
 )
-from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.inference import is_dict_like

 cdef:
@@ -1077,7 +1071,7 @@ cdef class TextReader:

             # don't try to upcast EAs
             if (
-                na_count > 0 and not is_extension_array_dtype(col_dtype)
+                na_count > 0 and not isinstance(col_dtype, ExtensionDtype)
                 or self.dtype_backend != "numpy"
             ):
                 use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None
@@ -1142,14 +1136,14 @@ cdef class TextReader:
             # (see _try_bool_flex()). Usually this would be taken care of using
             # _maybe_upcast(), but if col_dtype is a floating type we should just
             # take care of that cast here.
-            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+            if col_res.dtype == np.bool_ and col_dtype.kind == "f":
                 mask = col_res.view(np.uint8) == na_values[np.uint8]
                 col_res = col_res.astype(col_dtype)
                 np.putmask(col_res, mask, np.nan)
                 return col_res, na_count

             # NaNs are already cast to True here, so can not use astype
-            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+            if col_res.dtype == np.bool_ and col_dtype.kind in "iu":
                 if na_count > 0:
                     raise ValueError(
                         f"cannot safely convert passed user dtype of "
@@ -1193,14 +1187,14 @@ cdef class TextReader:
                 cats, codes, dtype, true_values=true_values)
             return cat, na_count

-        elif is_extension_array_dtype(dtype):
+        elif isinstance(dtype, ExtensionDtype):
             result, na_count = self._string_convert(i, start, end, na_filter,
                                                     na_hashset)

             array_type = dtype.construct_array_type()
             try:
                 # use _from_sequence_of_strings if the class defines it
-                if is_bool_dtype(dtype):
+                if dtype.kind == "b":
                     true_values = [x.decode() for x in self.true_values]
                     false_values = [x.decode() for x in self.false_values]
                     result = array_type._from_sequence_of_strings(
@@ -1216,7 +1210,7 @@

             return result, na_count

-        elif is_integer_dtype(dtype):
+        elif dtype.kind in "iu":
             try:
                 result, na_count = _try_int64(self.parser, i, start,
                                               end, na_filter, na_hashset)
@@ -1233,14 +1227,14 @@

             return result, na_count

-        elif is_float_dtype(dtype):
+        elif dtype.kind == "f":
             result, na_count = _try_double(self.parser, i, start, end,
                                            na_filter, na_hashset, na_flist)

             if result is not None and dtype != "float64":
                 result = result.astype(dtype)
             return result, na_count
-        elif is_bool_dtype(dtype):
+        elif dtype.kind == "b":
             result, na_count = _try_bool_flex(self.parser, i, start, end,
                                               na_filter, na_hashset,
                                               self.true_set, self.false_set)
@@ -1267,10 +1261,10 @@
             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
-        elif is_object_dtype(dtype):
+        elif dtype == object:
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
-        elif is_datetime64_dtype(dtype):
+        elif dtype.kind == "M":
             raise TypeError(f"the dtype {dtype} is not supported "
                             f"for parsing, pass this column "
                             f"using parse_dates instead")
@@ -1438,7 +1432,7 @@ def _maybe_upcast(
     -------
     The casted array.
     """
-    if is_extension_array_dtype(arr.dtype):
+    if isinstance(arr.dtype, ExtensionDtype):
         # TODO: the docstring says arr is an ndarray, in which case this cannot
         # be reached. Is that incorrect?
         return arr
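
`_maybe_upcast` now recognizes extension-array results with a plain `isinstance` check on the dtype object rather than `is_extension_array_dtype`. A minimal sketch of the same check outside the parser (the sample arrays are just illustrative):

# Sketch: an ndarray has a plain NumPy dtype, a nullable array has an ExtensionDtype.
import numpy as np
import pandas as pd

from pandas.api.extensions import ExtensionDtype

plain = np.array([1.0, np.nan])                # regular NumPy ndarray
nullable = pd.array([1, None], dtype="Int64")  # masked extension array

assert not isinstance(plain.dtype, ExtensionDtype)
assert isinstance(nullable.dtype, ExtensionDtype)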
15 changes: 10 additions & 5 deletions pandas/_testing/asserters.py
@@ -13,10 +13,8 @@

 from pandas.core.dtypes.common import (
     is_bool,
-    is_categorical_dtype,
     is_extension_array_dtype,
     is_integer_dtype,
-    is_interval_dtype,
     is_number,
     is_numeric_dtype,
     needs_i8_conversion,
@@ -33,6 +31,7 @@
     DataFrame,
     DatetimeIndex,
     Index,
+    IntervalDtype,
     IntervalIndex,
     MultiIndex,
     PeriodIndex,
@@ -238,7 +237,9 @@ def _check_types(left, right, obj: str = "Index") -> None:
         assert_attr_equal("inferred_type", left, right, obj=obj)

         # Skip exact dtype checking when `check_categorical` is False
-        if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
+        if isinstance(left.dtype, CategoricalDtype) and isinstance(
+            right.dtype, CategoricalDtype
+        ):
             if check_categorical:
                 assert_attr_equal("dtype", left, right, obj=obj)
                 assert_index_equal(left.categories, right.categories, exact=exact)
@@ -335,7 +336,9 @@ def _get_ilevel_values(index, level):
         assert_interval_array_equal(left._values, right._values)

     if check_categorical:
-        if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
+        if isinstance(left.dtype, CategoricalDtype) or isinstance(
+            right.dtype, CategoricalDtype
+        ):
             assert_categorical_equal(left._values, right._values, obj=f"{obj} category")

@@ -946,7 +949,9 @@ def assert_series_equal(
                 f"is not equal to {right._values}."
             )
             raise AssertionError(msg)
-    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
+    elif isinstance(left.dtype, IntervalDtype) and isinstance(
+        right.dtype, IntervalDtype
+    ):
         assert_interval_array_equal(left.array, right.array)
     elif isinstance(left.dtype, CategoricalDtype) or isinstance(
         right.dtype, CategoricalDtype
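
In the asserters, the `is_categorical_dtype`/`is_interval_dtype` helpers give way to `isinstance` checks against `CategoricalDtype` and `IntervalDtype`. A small sketch of the equivalent checks (the sample Series are made up for illustration):

# Sketch of the isinstance form used by the asserters above.
import pandas as pd
from pandas import CategoricalDtype, IntervalDtype

cat = pd.Series(["a", "b", "a"], dtype="category")
iv = pd.Series(pd.interval_range(0, 3))

assert isinstance(cat.dtype, CategoricalDtype)
assert isinstance(iv.dtype, IntervalDtype)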
8 changes: 4 additions & 4 deletions pandas/core/algorithms.py
@@ -42,7 +42,6 @@
     ensure_platform_int,
     is_array_like,
     is_bool_dtype,
-    is_categorical_dtype,
     is_complex_dtype,
     is_dict_like,
     is_extension_array_dtype,
@@ -59,6 +58,7 @@
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import (
     BaseMaskedDtype,
+    CategoricalDtype,
     ExtensionDtype,
     PandasDtype,
 )
@@ -141,7 +141,7 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
             return _ensure_data(values._data)
         return np.asarray(values)

-    elif is_categorical_dtype(values.dtype):
+    elif isinstance(values.dtype, CategoricalDtype):
         # NB: cases that go through here should NOT be using _reconstruct_data
         # on the back-end.
         values = cast("Categorical", values)
@@ -417,7 +417,7 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
     """See algorithms.unique for docs. Takes a mask for masked arrays."""
     values = _ensure_arraylike(values)

-    if is_extension_array_dtype(values.dtype):
+    if isinstance(values.dtype, ExtensionDtype):
         # Dispatch to extension dtype's unique.
         return values.unique()

@@ -1534,7 +1534,7 @@ def safe_sort(
     ordered: AnyArrayLike

     if (
-        not is_extension_array_dtype(values)
+        not isinstance(values.dtype, ExtensionDtype)
         and lib.infer_dtype(values, skipna=False) == "mixed-integer"
     ):
         ordered = _sort_mixed(values)
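
One detail worth flagging in the `safe_sort` hunk: `is_extension_array_dtype(values)` accepted the array itself, while the `isinstance` replacement must be pointed at `values.dtype`. A sketch of the difference (the array contents are arbitrary):

# Sketch: the helper normalizes its argument, isinstance does not.
import pandas as pd

from pandas.api.extensions import ExtensionDtype
from pandas.api.types import is_extension_array_dtype

values = pd.array(["x", "y"], dtype="category")

assert is_extension_array_dtype(values)          # accepts the array
assert is_extension_array_dtype(values.dtype)    # and the dtype

assert isinstance(values.dtype, ExtensionDtype)  # dtype object: True
assert not isinstance(values, ExtensionDtype)    # the array itself: False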