diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 24cc1c6f9fa70..bd17b710b108d 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -5,6 +5,7 @@ from .pandas_vb_common import ( datetime_dtypes, extension_dtypes, + lib, numeric_dtypes, string_dtypes, ) @@ -40,4 +41,25 @@ def time_pandas_dtype_invalid(self, dtype): pass +class InferDtypes: + param_names = ["dtype"] + data_dict = { + "np-object": np.array([1] * 100000, dtype="O"), + "py-object": [1] * 100000, + "np-null": np.array([1] * 50000 + [np.nan] * 50000), + "py-null": [1] * 50000 + [None] * 50000, + "np-int": np.array([1] * 100000, dtype=int), + "np-floating": np.array([1.0] * 100000, dtype=float), + "empty": [], + "bytes": [b"a"] * 100000, + } + params = list(data_dict.keys()) + + def time_infer_skipna(self, dtype): + lib.infer_dtype(self.data_dict[dtype], skipna=True) + + def time_infer(self, dtype): + lib.infer_dtype(self.data_dict[dtype], skipna=False) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c288a008777cf..a24e06d735447 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -659,6 +659,7 @@ Performance improvements - Performance improvement when checking if values in a :class:`Categorical` are equal, equal or larger or larger than a given scalar. The improvement is not present if checking if the :class:`Categorical` is less than or less than or equal than the scalar (:issue:`29820`) - Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) +- Performance improvement in :func:`~pandas.api.types.infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) .. _whatsnew_1000.bug_fixes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e9a486894fbf0..ac73cf5e7ec9a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1259,9 +1259,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: # make contiguous values = values.ravel() - if skipna: - values = values[~isnaobj(values)] - val = _try_infer_map(values) if val is not None: return val @@ -1269,6 +1266,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if values.dtype != np.object_: values = values.astype('O') + if skipna: + values = values[~isnaobj(values)] + n = len(values) if n == 0: return 'empty'