Skip to content

ENH: Nullable integer/boolean/floating support in lib inferencing functions #40914

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 5, 2021
Merged
2 changes: 2 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def maybe_convert_numeric(
na_values: set,
convert_empty: bool = True,
coerce_numeric: bool = False,
convert_to_nullable_integer: bool = False,
convert_to_floating_array: bool = False
) -> np.ndarray: ...

# TODO: restrict `arr`?
Expand Down
65 changes: 57 additions & 8 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2007,7 +2007,9 @@ def maybe_convert_numeric(
set na_values,
bint convert_empty=True,
bint coerce_numeric=False,
) -> ndarray:
bint convert_to_nullable_integer=False,
bint convert_to_floating_array=False,
) -> "ArrayLike":
"""
Convert object array to a numeric array if possible.

Expand All @@ -2031,10 +2033,15 @@ def maybe_convert_numeric(
numeric array has no suitable numerical dtype to return (i.e. uint64,
int32, uint8). If set to False, the original object array will be
returned. Otherwise, a ValueError will be raised.

convert_to_nullable_integer : bool, default False
If the array-like object contains only integer values and missing values
(NaN), whether to convert and return an IntegerArray.
convert_to_floating_array : bool, default False
If the array-like object contains only float values and missing values
(NaN), whether to convert and return a FloatingArray.
Returns
-------
np.ndarray
np.ndarray or ExtensionArray
Array of converted object values to numerical ones.
"""
if len(values) == 0:
Expand Down Expand Up @@ -2062,21 +2069,39 @@ def maybe_convert_numeric(
ndarray[int64_t] ints = np.empty(n, dtype='i8')
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
float64_t fval

for i in range(n):
val = values[i]
# We only want to disable NaNs showing as float if
# a) convert_to_nullable_integer = True
# b) no floats have been seen ( assuming an int shows up later )
# However, if no ints present (all null array), we need to return floats
allow_null_in_int = convert_to_nullable_integer and not seen.float_

if val.__hash__ is not None and val in na_values:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_floating_array:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif util.is_float_object(val):
fval = val
if fval != fval:
seen.null_ = True

if allow_null_in_int:
mask[i] = 1
else:
if convert_to_floating_array:
mask[i] = 1
seen.float_ = True
else:
seen.float_ = True
floats[i] = complexes[i] = fval
seen.float_ = True
elif util.is_integer_object(val):
floats[i] = complexes[i] = val

Expand All @@ -2099,7 +2124,13 @@ def maybe_convert_numeric(
floats[i] = uints[i] = ints[i] = bools[i] = val
seen.bool_ = True
elif val is None or val is C_NA:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_floating_array:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif hasattr(val, '__len__') and len(val) == 0:
if convert_empty or seen.coerce_numeric:
Expand All @@ -2123,14 +2154,18 @@ def maybe_convert_numeric(
else:
if fval != fval:
seen.null_ = True
mask[i] = 1

floats[i] = fval

if maybe_int:
as_int = int(val)

if as_int in na_values:
seen.saw_null()
mask[i] = 1
seen.null_ = True
if not convert_to_nullable_integer:
seen.float_ = True
else:
seen.saw_int(as_int)

Expand Down Expand Up @@ -2160,11 +2195,25 @@ def maybe_convert_numeric(
if seen.check_uint64_conflict():
return values

# Nulls were not flagged as float in anticipation of seeing ints, but no
# ints were ever seen, so fall back to returning floats.
if convert_to_nullable_integer and seen.null_ and not seen.int_:
seen.float_ = True

if seen.complex_:
return complexes
elif seen.float_:
if seen.null_ and convert_to_floating_array:
from pandas.core.arrays import FloatingArray
return FloatingArray(floats, mask.view(np.bool_))
return floats
elif seen.int_:
if seen.null_ and convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
if seen.uint_:
return IntegerArray(uints, mask.view(np.bool_))
else:
return IntegerArray(ints, mask.view(np.bool_))
if seen.uint_:
return uints
else:
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/ops.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@ def vec_binop(
def maybe_convert_bool(
arr: np.ndarray, # np.ndarray[object]
true_values=...,
false_values=...
false_values=...,
convert_to_nullable_boolean: bool = True,
) -> np.ndarray: ...
32 changes: 18 additions & 14 deletions pandas/_libs/ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ import_array()


from pandas._libs.missing cimport checknull
from pandas._libs.util cimport (
UINT8_MAX,
is_nan,
)
from pandas._libs.util cimport is_nan


@cython.wraparound(False)
Expand Down Expand Up @@ -258,17 +255,20 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None) -> ndarray:
true_values=None,
false_values=None,
convert_to_nullable_boolean=False) -> "ArrayLike":
cdef:
Py_ssize_t i, n
ndarray[uint8_t] result
ndarray[uint8_t] mask
object val
set true_vals, false_vals
int na_count = 0
bint has_na = False

n = len(arr)
result = np.empty(n, dtype=np.uint8)

mask = np.zeros(n, dtype=np.uint8)
# the defaults
true_vals = {'True', 'TRUE', 'true'}
false_vals = {'False', 'FALSE', 'false'}
Expand All @@ -292,15 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
elif val in false_vals:
result[i] = 0
elif isinstance(val, float):
result[i] = UINT8_MAX
na_count += 1
mask[i] = 1
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
has_na = True
else:
return arr

if na_count > 0:
mask = result == UINT8_MAX
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return arr
if has_na:
if convert_to_nullable_boolean:
from pandas.core.arrays import BooleanArray
return BooleanArray(result.view(np.bool_), mask.view(np.bool_))
else:
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return arr
else:
return result.view(np.bool_)
66 changes: 65 additions & 1 deletion pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pandas._libs import (
lib,
missing as libmissing,
ops as libops,
)
import pandas.util._test_decorators as td

Expand Down Expand Up @@ -60,7 +61,11 @@
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntegerArray
from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
)


@pytest.fixture(params=[True, False], ids=str)
Expand Down Expand Up @@ -415,6 +420,29 @@ def test_isneginf_scalar(self, value, expected):
result = libmissing.isneginf_scalar(value)
assert result is expected

@pytest.mark.parametrize(
"convert_to_nullable_boolean, exp",
[
(
True,
BooleanArray(
np.array([True, False], dtype="bool"), np.array([False, True])
),
),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we already have tests for maybe_convert_bool? If so, can you integrate this test with those?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. maybe_convert_bool is only used by the Python parser.

(False, np.array([True, np.nan], dtype="object")),
],
)
def test_maybe_convert_nullable_boolean(self, convert_to_nullable_boolean, exp):
    """
    Check ``libops.maybe_convert_bool`` on an object array holding a NaN.

    When ``convert_to_nullable_boolean`` is True the result is compared to
    ``exp`` as an extension array; otherwise it is compared as a NumPy array.
    """
    # GH 40687
    # Use np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
    arr = np.array([True, np.nan], dtype=object)
    result = libops.maybe_convert_bool(
        arr, set(), convert_to_nullable_boolean=convert_to_nullable_boolean
    )
    if convert_to_nullable_boolean:
        tm.assert_extension_array_equal(result, exp)
    else:
        tm.assert_numpy_array_equal(result, exp)

@pytest.mark.parametrize("coerce_numeric", [True, False])
@pytest.mark.parametrize(
"infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"]
Expand Down Expand Up @@ -607,6 +635,42 @@ def test_maybe_convert_objects_nullable_integer(self, exp):

tm.assert_extension_array_equal(result, exp)

@pytest.mark.parametrize(
"exp",
[
IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
],
)
def test_maybe_convert_numeric_nullable_integer(self, exp):
    """
    Check ``lib.maybe_convert_numeric`` with ``convert_to_nullable_integer=True``.

    The input is an object array of an integer and a NaN; the result is
    compared to ``exp`` as an extension array.
    """
    # GH 40687
    # Use np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
    arr = np.array([2, np.nan], dtype=object)
    result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True)
    tm.assert_extension_array_equal(result, exp)

@pytest.mark.parametrize(
"convert_to_floating_array, exp",
[
(
True,
FloatingArray(
np.array([2.0, 0.0], dtype="float64"), np.array([False, True])
),
),
(False, np.array([2.0, np.nan])),
],
)
def test_maybe_convert_numeric_floating_array(self, convert_to_floating_array, exp):
    """
    Check ``lib.maybe_convert_numeric`` with the ``convert_to_floating_array`` flag.

    The input is an object array of an integer and a NaN. With the flag set,
    the result is compared to ``exp`` as an extension array; otherwise it is
    compared as a NumPy array.
    """
    # GH 40687
    values = np.array([2, np.nan], dtype=object)
    converted = lib.maybe_convert_numeric(
        values, set(), convert_to_floating_array=convert_to_floating_array
    )
    # Pick the comparison helper that matches the expected container type.
    compare = (
        tm.assert_extension_array_equal
        if convert_to_floating_array
        else tm.assert_numpy_array_equal
    )
    compare(converted, exp)

def test_maybe_convert_objects_bool_nan(self):
# GH32146
ind = Index([True, False, np.nan], dtype=object)
Expand Down