From 3d6958dc8d06b67592035679ca23cddbc41f0b09 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:19:33 +0100 Subject: [PATCH 1/5] ENH: maybe_convert_objects add boolean support with NA --- pandas/_libs/lib.pyi | 12 ++++++------ pandas/_libs/lib.pyx | 21 ++++++++++++++++----- pandas/tests/dtypes/test_inference.py | 24 +++++++++++++++++++++++- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 3cbc04fb2f5cd..9bc02e90ebb9e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -75,7 +75,7 @@ def maybe_convert_objects( convert_timedelta: Literal[False] = ..., convert_period: Literal[False] = ..., convert_interval: Literal[False] = ..., - convert_to_nullable_integer: Literal[False] = ..., + convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray @@ -88,7 +88,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[False] = ..., convert_interval: Literal[False] = ..., - convert_to_nullable_integer: Literal[False] = ..., + convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... @overload @@ -101,7 +101,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: Literal[True] = ..., + convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -114,7 +114,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... 
@overload @@ -127,7 +127,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[True] = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -140,7 +140,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e35cf2fb13768..de10626f3e574 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1315,6 +1315,14 @@ cdef class Seen: or self.numeric_ or self.nan_ or self.null_ or self.object_ ) + @property + def is_bool_or_na(self): + # i.e. not (anything but bool or missing values) + return not ( + self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_ + or self.period_ or self.interval_ or self.numeric_ or self.object_ + ) + cdef object _try_infer_map(object dtype): """ @@ -2335,7 +2343,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_timedelta=False, bint convert_period=False, bint convert_interval=False, - bint convert_to_nullable_integer=False, + bint convert_to_nullable_dtype=False, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2362,9 +2370,9 @@ def maybe_convert_objects(ndarray[object] objects, convert_interval : bool, default False If an array-like object contains only Interval objects (with matching dtypes and closedness) or NaN, whether to convert to IntervalArray. - convert_to_nullable_integer : bool, default False - If an array-like object contains only integer values (and NaN) is - encountered, whether to convert and return an IntegerArray. 
+ convert_to_nullable_dtype : bool, default False + If an array-like object containing only integer or boolean values (and NaN) is + encountered, whether to convert and return a Boolean/IntegerArray. dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. @@ -2606,6 +2614,9 @@ def maybe_convert_objects(ndarray[object] objects, if seen.is_bool: # is_bool property rules out everything else return bools.view(np.bool_) + elif convert_to_nullable_dtype and seen.is_bool_or_na: + from pandas.core.arrays import BooleanArray + return BooleanArray(bools.view(np.bool_), mask) seen.object_ = True if not seen.object_: @@ -2617,7 +2628,7 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.float_: result = floats elif seen.int_: - if convert_to_nullable_integer: + if convert_to_nullable_dtype: from pandas.core.arrays import IntegerArray result = IntegerArray(ints, mask) else: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index df2afad51abf8..4a8099d5b2b0f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -859,7 +859,7 @@ def test_maybe_convert_objects_timedelta64_nat(self): def test_maybe_convert_objects_nullable_integer(self, exp): # GH27335 arr = np.array([2, np.NaN], dtype=object) - result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(result, exp) @@ -918,6 +918,28 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) + def test_maybe_convert_objects_nullable_boolean(self): + # GH + arr = np.array([True, False], dtype=object) + exp = np.array([True, False]) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([True, False, pd.NaT], 
dtype=object) + exp = np.array([True, False, pd.NaT], dtype=object) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + @pytest.mark.parametrize("val", [None, np.nan]) + def test_maybe_convert_objects_nullable_boolean_na(self, val): + # GH + arr = np.array([True, False, val], dtype=object) + exp = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_extension_array_equal(out, exp) + @pytest.mark.parametrize( "data0", [ From 43545c5456d42eb64403e5b2260be65c7e7126c0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:51:01 +0100 Subject: [PATCH 2/5] Fix merge error --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5361bb499240b..726fe79c2d702 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2454,7 +2454,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.int_ = True floats[i] = val complexes[i] = val - if not seen.null_ or convert_to_nullable_integer: + if not seen.null_ or convert_to_nullable_dtype: seen.saw_int(val) if ((seen.uint_ and seen.sint_) or From 1ed72bfa1fb53925575f00223c4121e234acd78b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:52:25 +0100 Subject: [PATCH 3/5] Add gh ref --- pandas/tests/dtypes/test_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 4a6d480dcc988..b075718a678d6 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -931,7 +931,7 @@ def test_maybe_convert_objects_bool_nan(self): tm.assert_numpy_array_equal(out, exp) def test_maybe_convert_objects_nullable_boolean(self): - # GH + # GH50047 arr = np.array([True, False], dtype=object) exp = 
np.array([True, False]) out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) @@ -944,7 +944,7 @@ def test_maybe_convert_objects_nullable_boolean(self): @pytest.mark.parametrize("val", [None, np.nan]) def test_maybe_convert_objects_nullable_boolean_na(self, val): - # GH + # GH50047 arr = np.array([True, False, val], dtype=object) exp = BooleanArray( np.array([True, False, False]), np.array([False, False, True]) From 62c798f664545b9250dba58248f6d0f915d0621d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 23:00:26 +0100 Subject: [PATCH 4/5] Fix test --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b075718a678d6..c9b61afb5eb25 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -869,7 +869,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp): def test_maybe_convert_objects_nullable_none(self, dtype, val): # GH#50043 arr = np.array([val, None, 3], dtype="object") - result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) expected = IntegerArray( np.array([val, 0, 3], dtype=dtype), np.array([False, True, False]) ) From 85c995ae9f08284e11a352cc9f65b251d29aaf9c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 01:30:39 +0100 Subject: [PATCH 5/5] Simplify --- pandas/_libs/lib.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 726fe79c2d702..462537af3383a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1309,16 +1309,12 @@ cdef class Seen: @property def is_bool(self): # i.e. 
not (anything but bool) - return not ( - self.datetime_ or self.datetimetz_ or self.timedelta_ or self.nat_ - or self.period_ or self.interval_ - or self.numeric_ or self.nan_ or self.null_ or self.object_ - ) + return self.is_bool_or_na and not (self.nan_ or self.null_) @property def is_bool_or_na(self): # i.e. not (anything but bool or missing values) - return not ( + return self.bool_ and not ( self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_ or self.period_ or self.interval_ or self.numeric_ or self.object_ )