Skip to content

BUG/TST: Fix infer_dtype for Period array-likes and general ExtensionArrays #37367

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Feb 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,8 @@ ExtensionArray
Other
^^^^^
- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
- Bug in :func:`pandas.api.types.infer_dtype` not recognizing Series, Index or array with a period dtype (:issue:`23553`)
- Bug in :func:`pandas.api.types.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`)
- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
- Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
Expand Down
20 changes: 12 additions & 8 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ from pandas._libs cimport util
from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan

from pandas._libs.tslib import array_to_datetime
from pandas._libs.tslibs.period import Period

from pandas._libs.missing cimport (
C_NA,
Expand Down Expand Up @@ -1082,6 +1083,7 @@ _TYPE_MAP = {
"timedelta64[ns]": "timedelta64",
"m": "timedelta64",
"interval": "interval",
Period: "period",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prob need Interval here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interval is already handled by the "interval" string in the line above; see the non-inline comment with the link to the PR where this was fixed before.

}

# types only exist on certain platform
Expand Down Expand Up @@ -1233,8 +1235,8 @@ cdef object _try_infer_map(object dtype):
cdef:
object val
str attr
for attr in ["name", "kind", "base"]:
val = getattr(dtype, attr)
for attr in ["name", "kind", "base", "type"]:
val = getattr(dtype, attr, None)
if val in _TYPE_MAP:
return _TYPE_MAP[val]
return None
Expand Down Expand Up @@ -1275,6 +1277,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
- time
- period
- mixed
- unknown-array

Raises
------
Expand All @@ -1287,6 +1290,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
specialized
- 'mixed-integer-float' are floats and integers
- 'mixed-integer' are integers mixed with non-integers
- 'unknown-array' is the catchall for something that *is* an array (has
a dtype attribute), but has a dtype unknown to pandas (e.g. external
extension array)

Examples
--------
Expand Down Expand Up @@ -1355,12 +1361,10 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
# e.g. categoricals
dtype = value.dtype
if not isinstance(dtype, np.dtype):
value = _try_infer_map(value.dtype)
if value is not None:
return value

# its ndarray-like but we can't handle
raise ValueError(f"cannot infer type for {type(value)}")
inferred = _try_infer_map(value.dtype)
if inferred is not None:
return inferred
return "unknown-array"

# Unwrap Series/Index
values = np.asarray(value)
Expand Down
6 changes: 1 addition & 5 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,7 @@ def _validate(data):
if isinstance(values.dtype, StringDtype):
return "string"

try:
inferred_dtype = lib.infer_dtype(values, skipna=True)
except ValueError:
# GH#27571 mostly occurs with ExtensionArray
inferred_dtype = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's another case almost identical to this in dtypes.cast.convert_dtypes

inferred_dtype = lib.infer_dtype(values, skipna=True)

if inferred_dtype not in allowed_types:
raise AttributeError("Can only use .str accessor with string values!")
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,19 @@ def test_infer_dtype_period(self):
arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")])
assert lib.infer_dtype(arr, skipna=True) == "period"

# Checks that infer_dtype returns "period" for period data wrapped in each of
# the parametrized containers (pd.array, pd.Series, pd.Index), for both
# skipna=True and skipna=False.
@pytest.mark.parametrize("klass", [pd.array, pd.Series, pd.Index])
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype_period_array(self, klass, skipna):
# https://github.com/pandas-dev/pandas/issues/23553
# Input: two daily-frequency Periods followed by a pd.NaT entry.
values = klass(
[
Period("2011-01-01", freq="D"),
Period("2011-01-02", freq="D"),
pd.NaT,
]
)
# The expected result is the same regardless of the skipna setting.
assert lib.infer_dtype(values, skipna=skipna) == "period"

def test_infer_dtype_period_mixed(self):
arr = np.array(
[Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/extension/base/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

import pandas as pd
from pandas.api.types import is_object_dtype, is_string_dtype
from pandas.api.types import infer_dtype, is_object_dtype, is_string_dtype
from pandas.tests.extension.base.base import BaseExtensionTests


Expand Down Expand Up @@ -123,3 +123,11 @@ def test_get_common_dtype(self, dtype):
# still testing as good practice to have this working (and it is the
# only case we can test in general)
assert dtype._get_common_dtype([dtype]) == dtype

# Smoke test: infer_dtype must return a string (rather than raise) for the
# `data` and `data_missing` fixtures, with skipna both True and False.
# NOTE(review): the fixtures are defined outside this view; presumably they are
# extension arrays without / with missing values -- confirm against the fixtures.
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype(self, data, data_missing, skipna):
# only testing that this works without raising an error
res = infer_dtype(data, skipna=skipna)
# The exact label is not checked here, only that some string is returned.
assert isinstance(res, str)
res = infer_dtype(data_missing, skipna=skipna)
assert isinstance(res, str)
8 changes: 8 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.api.types import infer_dtype
from pandas.tests.extension import base
from pandas.tests.extension.decimal.array import (
DecimalArray,
Expand Down Expand Up @@ -120,6 +121,13 @@ class TestDtype(BaseDecimal, base.BaseDtypeTests):
def test_hashable(self, dtype):
pass

# Pins the exact fallback label: for the decimal test arrays, infer_dtype must
# return "unknown-array" for both fixtures and both skipna settings.
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype(self, data, data_missing, skipna):
# here overriding base test to ensure we fall back to return
# "unknown-array" for an EA pandas doesn't know
assert infer_dtype(data, skipna=skipna) == "unknown-array"
assert infer_dtype(data_missing, skipna=skipna) == "unknown-array"


class TestInterface(BaseDecimal, base.BaseInterfaceTests):
pass
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,7 +938,8 @@ def test_unsupported(self, fp):

# period
df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
self.check_error_on_write(df, fp, ValueError, "cannot infer type for")
# error from fastparquet -> don't check exact error message
self.check_error_on_write(df, fp, ValueError, None)

# mixed
df = pd.DataFrame({"a": ["a", 1, 2.0]})
Expand Down