Skip to content

ENH: maybe_convert_objects add boolean support with NA #50047

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def maybe_convert_objects(
convert_timedelta: Literal[False] = ...,
convert_period: Literal[False] = ...,
convert_interval: Literal[False] = ...,
convert_to_nullable_integer: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
@overload # both convert_datetime and convert_to_nullable_dtype False -> np.ndarray
Expand All @@ -88,7 +88,7 @@ def maybe_convert_objects(
convert_timedelta: bool = ...,
convert_period: Literal[False] = ...,
convert_interval: Literal[False] = ...,
convert_to_nullable_integer: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> np.ndarray: ...
@overload
Expand All @@ -101,7 +101,7 @@ def maybe_convert_objects(
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: Literal[True] = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
Expand All @@ -114,7 +114,7 @@ def maybe_convert_objects(
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
Expand All @@ -127,7 +127,7 @@ def maybe_convert_objects(
convert_timedelta: bool = ...,
convert_period: Literal[True] = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
Expand All @@ -140,7 +140,7 @@ def maybe_convert_objects(
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
Expand Down
27 changes: 17 additions & 10 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1309,10 +1309,14 @@ cdef class Seen:
@property
def is_bool(self):
# i.e. not (anything but bool)
return not (
self.datetime_ or self.datetimetz_ or self.timedelta_ or self.nat_
or self.period_ or self.interval_
or self.numeric_ or self.nan_ or self.null_ or self.object_
return self.is_bool_or_na and not (self.nan_ or self.null_)

@property
def is_bool_or_na(self):
    """
    Whether booleans were seen and nothing else except missing values.

    The ``nan_`` and ``null_`` flags are deliberately not consulted here,
    so missing values do not disqualify the result.
    """
    # Without at least one boolean there is nothing to report.
    if not self.bool_:
        return False

    # Any of these flags means a non-boolean, non-missing value was seen.
    saw_other_kind = (
        self.datetime_
        or self.datetimetz_
        or self.nat_
        or self.timedelta_
        or self.period_
        or self.interval_
        or self.numeric_
        or self.object_
    )
    return not saw_other_kind


Expand Down Expand Up @@ -2335,7 +2339,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_timedelta=False,
bint convert_period=False,
bint convert_interval=False,
bint convert_to_nullable_integer=False,
bint convert_to_nullable_dtype=False,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand All @@ -2362,9 +2366,9 @@ def maybe_convert_objects(ndarray[object] objects,
convert_interval : bool, default False
If an array-like object contains only Interval objects (with matching
dtypes and closedness) or NaN, whether to convert to IntervalArray.
convert_to_nullable_integer : bool, default False
If an array-like object contains only integer values (and NaN) is
encountered, whether to convert and return an IntegerArray.
convert_to_nullable_dtype : bool, default False
If an array-like object contains only integer or boolean values (and NaN),
whether to convert and return a Boolean/IntegerArray.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm FloatArray isn't supported yet?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is something I raised in the SQL PR as well. Currently, we only use an extension array (EA) when the regular dtype can’t hold missing values, so there is no real use case for FloatArray.

It would probably make more sense to always convert when this flag is set, but that is out of scope here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I would opt for returning Float64 when people specify use_nullable_dtypes=True, especially once (hopefully) other core devs find it useful to store both NaN and NA.

But sure, that can be a follow-up.

dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
Dtype to cast to if we have all-NaT.

Expand Down Expand Up @@ -2446,7 +2450,7 @@ def maybe_convert_objects(ndarray[object] objects,
seen.int_ = True
floats[i] = <float64_t>val
complexes[i] = <double complex>val
if not seen.null_ or convert_to_nullable_integer:
if not seen.null_ or convert_to_nullable_dtype:
seen.saw_int(val)

if ((seen.uint_ and seen.sint_) or
Expand Down Expand Up @@ -2606,6 +2610,9 @@ def maybe_convert_objects(ndarray[object] objects,
if seen.is_bool:
# is_bool property rules out everything else
return bools.view(np.bool_)
elif convert_to_nullable_dtype and seen.is_bool_or_na:
from pandas.core.arrays import BooleanArray
return BooleanArray(bools.view(np.bool_), mask)
seen.object_ = True

if not seen.object_:
Expand All @@ -2617,7 +2624,7 @@ def maybe_convert_objects(ndarray[object] objects,
elif seen.float_:
result = floats
elif seen.int_ or seen.uint_:
if convert_to_nullable_integer:
if convert_to_nullable_dtype:
from pandas.core.arrays import IntegerArray
if seen.uint_:
result = IntegerArray(uints, mask)
Expand Down
26 changes: 24 additions & 2 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ def test_maybe_convert_objects_timedelta64_nat(self):
def test_maybe_convert_objects_nullable_integer(self, exp):
# GH27335
arr = np.array([2, np.NaN], dtype=object)
result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True)
result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)

tm.assert_extension_array_equal(result, exp)

Expand All @@ -869,7 +869,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp):
def test_maybe_convert_objects_nullable_none(self, dtype, val):
# GH#50043
arr = np.array([val, None, 3], dtype="object")
result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True)
result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
expected = IntegerArray(
np.array([val, 0, 3], dtype=dtype), np.array([False, True, False])
)
Expand Down Expand Up @@ -930,6 +930,28 @@ def test_maybe_convert_objects_bool_nan(self):
out = lib.maybe_convert_objects(ind.values, safe=1)
tm.assert_numpy_array_equal(out, exp)

def test_maybe_convert_objects_nullable_boolean(self):
    # GH50047
    # Only booleans, no missing values: a plain bool ndarray is expected
    # even though nullable conversion is requested.
    values = np.array([True, False], dtype=object)
    result = lib.maybe_convert_objects(values, convert_to_nullable_dtype=True)
    expected = np.array([True, False])
    tm.assert_numpy_array_equal(result, expected)

    # Booleans mixed with NaT: the values are expected back unchanged as
    # an object ndarray.
    values = np.array([True, False, pd.NaT], dtype=object)
    result = lib.maybe_convert_objects(values, convert_to_nullable_dtype=True)
    expected = np.array([True, False, pd.NaT], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("val", [None, np.nan])
def test_maybe_convert_objects_nullable_boolean_na(self, val):
    # GH50047
    # Booleans plus a missing value (None or NaN) are expected to come
    # back as a BooleanArray with the missing slot masked.
    values = np.array([True, False, val], dtype=object)
    result = lib.maybe_convert_objects(values, convert_to_nullable_dtype=True)

    data = np.array([True, False, False])
    mask = np.array([False, False, True])
    expected = BooleanArray(data, mask)
    tm.assert_extension_array_equal(result, expected)

@pytest.mark.parametrize(
"data0",
[
Expand Down