From a3b7ad719095fee970362ce8e5d203ef32213647 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Oct 2020 21:06:02 +0200 Subject: [PATCH 1/8] BUG/TST: Fix infer_dtype for Period array-likes and general ExtensionArrays --- pandas/_libs/lib.pyx | 16 +++++++++++----- pandas/tests/dtypes/test_inference.py | 13 +++++++++++++ pandas/tests/extension/base/dtype.py | 8 ++++++++ pandas/tests/extension/decimal/test_decimal.py | 7 +++++++ 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 001fbae120ae8..72e516a1065e1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1050,6 +1050,7 @@ _TYPE_MAP = { "M": "datetime64", "timedelta64[ns]": "timedelta64", "m": "timedelta64", + "period": "period", "interval": "interval", } @@ -1203,9 +1204,14 @@ cdef object _try_infer_map(object dtype): object val str attr for attr in ["name", "kind", "base"]: - val = getattr(dtype, attr) + val = getattr(dtype, attr, None) if val in _TYPE_MAP: return _TYPE_MAP[val] + # also check base name for parametrized dtypes (eg period[D]) + if isinstance(val, str): + val = val.split("[")[0] + if val in _TYPE_MAP: + return _TYPE_MAP[val] return None @@ -1324,12 +1330,12 @@ def infer_dtype(value: object, skipna: bool = True) -> str: # e.g. categoricals dtype = value.dtype if not isinstance(dtype, np.dtype): - value = _try_infer_map(value.dtype) - if value is not None: - return value + inferred = _try_infer_map(value.dtype) + if inferred is not None: + return inferred # its ndarray-like but we can't handle - raise ValueError(f"cannot infer type for {type(value)}") + values = np.asarray(value) # Unwrap Series/Index values = np.asarray(value) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index afae6ab7b9c07..155a91688d5e2 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -890,6 +890,19 @@ def test_infer_dtype_period(self): arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="M")]) assert lib.infer_dtype(arr, skipna=True) == "period" + @pytest.mark.parametrize("klass", [pd.array, pd.Series, pd.Index]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype_period_array(self, klass, skipna): + # https://github.com/pandas-dev/pandas/issues/23553 + values = klass( + [ + pd.Period("2011-01-01", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.NaT, + ] + ) + assert lib.infer_dtype(values, skipna=skipna) == "period" + def test_infer_dtype_period_mixed(self): arr = np.array( [pd.Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 154fcdc38826d..a55f85da9e15f 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -123,3 +123,11 @@ def test_get_common_dtype(self, dtype): # still testing as good practice to have this working (and it is the # only case we can test in general) assert dtype._get_common_dtype([dtype]) == dtype + + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype(self, data, data_missing, skipna): + # only testing that this works without raising an error + res = pd.api.types.infer_dtype(data, skipna=skipna) + assert isinstance(res, str) + res = pd.api.types.infer_dtype(data_missing, skipna=skipna) + assert isinstance(res, str) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 233b658d29782..bb6dac102a3a0 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -116,6 +116,13 @@ class TestDtype(BaseDecimal, base.BaseDtypeTests): def test_hashable(self, dtype): pass + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype(self, data, data_missing, skipna): + # here overriding base test to ensure we fall back to inferring the + # array elements for external EAs (which here are recognized as decimals) + assert pd.api.types.infer_dtype(data, skipna=skipna) == "decimal" + assert pd.api.types.infer_dtype(data_missing, skipna=skipna) == "decimal" + class TestInterface(BaseDecimal, base.BaseInterfaceTests): pass From d6f6c0ad724b0aef0d8695f95afae5457f7b6a58 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Nov 2020 21:05:01 +0100 Subject: [PATCH 2/8] remove duplicate asarray --- pandas/_libs/lib.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 75e53a3151322..cf714ebcaab0a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1357,9 +1357,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if inferred is not None: return inferred - # its ndarray-like but we can't handle - values = np.asarray(value) - # Unwrap Series/Index values = np.asarray(value) From 50d7425adbeb62781a15dbe1f21a9f1b894d70f7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Nov 2020 21:18:17 +0100 Subject: [PATCH 3/8] temp --- pandas/core/strings/accessor.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7d6a2bf1d776d..0e7b384e28460 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -209,11 +209,7 @@ def _validate(data): if isinstance(values.dtype, StringDtype): return "string" - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None + inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: raise AttributeError("Can only use .str accessor with string values!") From ce64b64eafa739d73263c444d7f839ade1e97f92 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Nov 2020 20:33:10 +0100 Subject: [PATCH 4/8] return unknown-array --- pandas/_libs/lib.pyx | 5 +++++ pandas/tests/dtypes/test_inference.py | 4 ++-- pandas/tests/extension/decimal/test_decimal.py | 8 ++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cf714ebcaab0a..36f0e1ed8315c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1273,6 +1273,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: - time - period - mixed + - unknown-array Raises ------ @@ -1285,6 +1286,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: specialized - 'mixed-integer-float' are floats and integers - 'mixed-integer' are integers mixed with non-integers + - 'unknown-array' is the catchall for something that *is* an array (has + a dtype attribute), but has a dtype unknown to pandas (e.g. external + extension array) Examples -------- @@ -1356,6 +1360,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: inferred = _try_infer_map(value.dtype) if inferred is not None: return inferred + return "unknown-array" # Unwrap Series/Index values = np.asarray(value) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b298f9b48c3c7..d5b2a41dab6db 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -897,8 +897,8 @@ def test_infer_dtype_period_array(self, klass, skipna): # https://github.com/pandas-dev/pandas/issues/23553 values = klass( [ - pd.Period("2011-01-01", freq="D"), - pd.Period("2011-01-02", freq="D"), + Period("2011-01-01", freq="D"), + Period("2011-01-02", freq="D"), pd.NaT, ] ) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index bb6dac102a3a0..aa03ea9460b8f 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -118,10 +118,10 @@ def test_hashable(self, dtype): @pytest.mark.parametrize("skipna", [True, False]) def test_infer_dtype(self, data, data_missing, skipna): - # here overriding base test to ensure we fall back to inferring the - # array elements for external EAs (which here are recognized as decimals) - assert pd.api.types.infer_dtype(data, skipna=skipna) == "decimal" - assert pd.api.types.infer_dtype(data_missing, skipna=skipna) == "decimal" + # here overriding base test to ensure we fall back to return + # "unknown-array" for an EA pandas doesn't know + assert pd.api.types.infer_dtype(data, skipna=skipna) == "unknown-array" + assert pd.api.types.infer_dtype(data_missing, skipna=skipna) == "unknown-array" class TestInterface(BaseDecimal, base.BaseInterfaceTests): From 688104fc2779dba37b7b5f945c7378eee3b2ab1d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Jan 2021 19:29:59 +0100 Subject: [PATCH 5/8] add Period class in type map + check dtype.type --- pandas/_libs/lib.pyx | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3c61b036ca2c5..66c360480c605 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -69,6 +69,7 @@ from pandas._libs cimport util from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime +from pandas._libs.tslibs.period import Period from pandas._libs.missing cimport ( C_NA, @@ -1078,8 +1079,8 @@ _TYPE_MAP = { "M": "datetime64", "timedelta64[ns]": "timedelta64", "m": "timedelta64", - "period": "period", "interval": "interval", + Period: "period", } # types only exist on certain platform @@ -1231,15 +1232,10 @@ cdef object _try_infer_map(object dtype): cdef: object val str attr - for attr in ["name", "kind", "base"]: + for attr in ["name", "kind", "base", "type"]: val = getattr(dtype, attr, None) if val in _TYPE_MAP: return _TYPE_MAP[val] - # also check base name for parametrized dtypes (eg period[D]) - if isinstance(val, str): - val = val.split("[")[0] - if val in _TYPE_MAP: - return _TYPE_MAP[val] return None From 4c539e3a394da02dbd3b1c1754c8f6c3e2f410ca Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 18 Jan 2021 11:30:35 +0100 Subject: [PATCH 6/8] add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6a85bfd852e19..d06992b450b2b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -366,8 +366,8 @@ ExtensionArray Other ^^^^^ - Bug in :class:`Index` constructor sometimes silently ignorning a specified ``dtype`` (:issue:`38879`) -- -- +- Bug in :func:`pandas.api.types.infer_dtype` not recognizing Series, Index or array with a period dtype (:issue:`23553`) +- Bug in :func:`pandas.api.types.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) .. --------------------------------------------------------------------------- From 88dcd81e92748df7b5d62fad8c134a66a188bac1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Feb 2021 13:42:06 +0100 Subject: [PATCH 7/8] fixup usage of pd.api.type --- pandas/tests/extension/base/dtype.py | 6 +++--- pandas/tests/extension/decimal/test_decimal.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index cbb5e12ad89ce..b63af0c22b450 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas.api.types import is_object_dtype, is_string_dtype +from pandas.api.types import infer_dtype, is_object_dtype, is_string_dtype from pandas.tests.extension.base.base import BaseExtensionTests @@ -127,7 +127,7 @@ def test_get_common_dtype(self, dtype): @pytest.mark.parametrize("skipna", [True, False]) def test_infer_dtype(self, data, data_missing, skipna): # only testing that this works without raising an error - res = pd.api.types.infer_dtype(data, skipna=skipna) + res = infer_dtype(data, skipna=skipna) assert isinstance(res, str) - res = pd.api.types.infer_dtype(data_missing, skipna=skipna) + res = infer_dtype(data_missing, skipna=skipna) assert isinstance(res, str) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b6438efcd138d..23b1ce250a5e5 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -7,6 +7,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import infer_dtype from pandas.tests.extension import base from pandas.tests.extension.decimal.array import ( DecimalArray, @@ -124,8 +125,8 @@ def test_hashable(self, dtype): def test_infer_dtype(self, data, data_missing, skipna): # here overriding base test to ensure we fall back to return # "unknown-array" for an EA pandas doesn't know - assert pd.api.types.infer_dtype(data, skipna=skipna) == "unknown-array" - assert pd.api.types.infer_dtype(data_missing, skipna=skipna) == "unknown-array" + assert infer_dtype(data, skipna=skipna) == "unknown-array" + assert infer_dtype(data_missing, skipna=skipna) == "unknown-array" class TestInterface(BaseDecimal, base.BaseInterfaceTests): From 100b83b10583d935bbfcc5c7a45ab3866a545213 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Feb 2021 15:01:21 +0100 Subject: [PATCH 8/8] fix error message from fastparquet --- pandas/tests/io/test_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5adb3730a8c86..c72363b088a5c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -938,7 +938,8 @@ def test_unsupported(self, fp): # period df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) - self.check_error_on_write(df, fp, ValueError, "cannot infer type for") + # error from fastparquet -> don't check exact error message + self.check_error_on_write(df, fp, ValueError, None) # mixed df = pd.DataFrame({"a": ["a", 1, 2.0]})