Skip to content

ENH: Nullable integer/boolean/floating support in lib inferencing functions #40914

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 5, 2021
Merged
16 changes: 15 additions & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ from typing import (
Any,
Callable,
Generator,
Literal,
overload,
)

import numpy as np
Expand Down Expand Up @@ -70,12 +72,24 @@ def maybe_convert_objects(
convert_to_nullable_integer: bool = False,
) -> ArrayLike: ...

@overload
def maybe_convert_numeric(
values: np.ndarray, # np.ndarray[object]
na_values: set,
convert_empty: bool = True,
coerce_numeric: bool = False,
) -> np.ndarray: ...
convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...

# Overload for callers that pass convert_to_masked_nullable=True (keyword-only
# here): the second element of the returned tuple is annotated as an ndarray.
@overload
def maybe_convert_numeric(
values: np.ndarray, # np.ndarray[object]
na_values: set,
convert_empty: bool = True,
coerce_numeric: bool = False,
*,
convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...

# TODO: restrict `arr`?
def ensure_string_array(
Expand Down
83 changes: 66 additions & 17 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2029,7 +2029,8 @@ def maybe_convert_numeric(
set na_values,
bint convert_empty=True,
bint coerce_numeric=False,
) -> ndarray:
bint convert_to_masked_nullable=False,
) -> tuple[np.ndarray, np.ndarray | None]:
"""
Convert object array to a numeric array if possible.

Expand All @@ -2053,14 +2054,20 @@ def maybe_convert_numeric(
numeric array has no suitable numerical dtype to return (i.e. uint64,
int32, uint8). If set to False, the original object array will be
returned. Otherwise, a ValueError will be raised.

convert_to_masked_nullable : bool, default False
Whether to also return a boolean mask marking the null entries of the
converted values. When True, integer data containing nulls is not
upcast to float64.

Returns
-------
np.ndarray
Array of converted object values to numerical ones.

Optional[np.ndarray]
If convert_to_masked_nullable is True and nulls were encountered,
a boolean mask for the converted values; otherwise None.
"""
if len(values) == 0:
return np.array([], dtype='i8')
return (np.array([], dtype='i8'), None)

# fastpath for ints - try to convert all based on first value
cdef:
Expand All @@ -2070,7 +2077,7 @@ def maybe_convert_numeric(
try:
maybe_ints = values.astype('i8')
if (maybe_ints == values).all():
return maybe_ints
return (maybe_ints, None)
except (ValueError, OverflowError, TypeError):
pass

Expand All @@ -2084,21 +2091,40 @@ def maybe_convert_numeric(
ndarray[int64_t] ints = np.empty(n, dtype='i8')
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
float64_t fval
bint allow_null_in_int = convert_to_masked_nullable

for i in range(n):
val = values[i]
# Nulls are only kept from forcing the result to float if
# a) convert_to_masked_nullable is True, and
# b) no floats have been seen so far (assuming an int shows up later).
# However, if no ints are present (all-null array), we need to return floats.
allow_null_in_int = convert_to_masked_nullable and not seen.float_

if val.__hash__ is not None and val in na_values:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif util.is_float_object(val):
fval = val
if fval != fval:
seen.null_ = True

if allow_null_in_int:
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.float_ = True
else:
seen.float_ = True
floats[i] = complexes[i] = fval
seen.float_ = True
elif util.is_integer_object(val):
floats[i] = complexes[i] = val

Expand All @@ -2121,7 +2147,13 @@ def maybe_convert_numeric(
floats[i] = uints[i] = ints[i] = bools[i] = val
seen.bool_ = True
elif val is None or val is C_NA:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif hasattr(val, '__len__') and len(val) == 0:
if convert_empty or seen.coerce_numeric:
Expand All @@ -2142,17 +2174,22 @@ def maybe_convert_numeric(
if fval in na_values:
seen.saw_null()
floats[i] = complexes[i] = NaN
mask[i] = 1
else:
if fval != fval:
seen.null_ = True
mask[i] = 1

floats[i] = fval

if maybe_int:
as_int = int(val)

if as_int in na_values:
seen.saw_null()
mask[i] = 1
seen.null_ = True
if not allow_null_in_int:
seen.float_ = True
else:
seen.saw_int(as_int)

Expand Down Expand Up @@ -2180,22 +2217,34 @@ def maybe_convert_numeric(
floats[i] = NaN

if seen.check_uint64_conflict():
return values
return (values, None)

# Nulls were kept from marking the result as float in anticipation of ints
# that never appeared, so fall back to returning floats.
if allow_null_in_int and seen.null_ and not seen.int_:
seen.float_ = True

if seen.complex_:
return complexes
return (complexes, None)
elif seen.float_:
return floats
if seen.null_ and convert_to_masked_nullable:
return (floats, mask.view(np.bool_))
return (floats, None)
elif seen.int_:
if seen.null_ and convert_to_masked_nullable:
if seen.uint_:
return (uints, mask.view(np.bool_))
else:
return (ints, mask.view(np.bool_))
if seen.uint_:
return uints
return (uints, None)
else:
return ints
return (ints, None)
elif seen.bool_:
return bools.view(np.bool_)
return (bools.view(np.bool_), None)
elif seen.uint_:
return uints
return ints
return (uints, None)
return (ints, None)


@cython.boundscheck(False)
Expand Down
16 changes: 14 additions & 2 deletions pandas/_libs/ops.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import (
Any,
Callable,
Literal,
overload,
)

import numpy as np
Expand Down Expand Up @@ -35,9 +37,19 @@ def vec_binop(
op: _BinOp, # binary operator
) -> np.ndarray: ...

# Overload for the default case (convert_to_masked_nullable omitted or False):
# the second element of the returned tuple is annotated as None.
@overload
def maybe_convert_bool(
arr: np.ndarray, # np.ndarray[object]
true_values=...,
false_values=...,
convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...

@overload
def maybe_convert_bool(
arr: np.ndarray, # np.ndarray[object]
true_values=...,
false_values=...
) -> np.ndarray: ...
false_values=...,
*,
convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...
42 changes: 23 additions & 19 deletions pandas/_libs/ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ import_array()


from pandas._libs.missing cimport checknull
from pandas._libs.util cimport (
UINT8_MAX,
is_nan,
)
from pandas._libs.util cimport is_nan


@cython.wraparound(False)
Expand Down Expand Up @@ -212,7 +209,7 @@ def scalar_binop(object[:] values, object val, object op) -> ndarray:
else:
result[i] = op(x, val)

return maybe_convert_bool(result.base)
return maybe_convert_bool(result.base)[0]


@cython.wraparound(False)
Expand Down Expand Up @@ -254,21 +251,25 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
else:
raise

return maybe_convert_bool(result.base) # `.base` to access np.ndarray
return maybe_convert_bool(result.base)[0] # `.base` to access np.ndarray


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None) -> ndarray:
true_values=None,
false_values=None,
convert_to_masked_nullable=False
) -> tuple[np.ndarray, np.ndarray | None]:
cdef:
Py_ssize_t i, n
ndarray[uint8_t] result
ndarray[uint8_t] mask
object val
set true_vals, false_vals
int na_count = 0
bint has_na = False

n = len(arr)
result = np.empty(n, dtype=np.uint8)

mask = np.zeros(n, dtype=np.uint8)
# the defaults
true_vals = {'True', 'TRUE', 'true'}
false_vals = {'False', 'FALSE', 'false'}
Expand All @@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
result[i] = 1
elif val in false_vals:
result[i] = 0
elif isinstance(val, float):
result[i] = UINT8_MAX
na_count += 1
elif is_nan(val):
mask[i] = 1
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
has_na = True
else:
return arr
return (arr, None)

if na_count > 0:
mask = result == UINT8_MAX
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return arr
if has_na:
if convert_to_masked_nullable:
return (result.view(np.bool_), mask.view(np.bool_))
else:
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return (arr, None)
else:
return result.view(np.bool_)
return (result.view(np.bool_), None)
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1356,7 +1356,7 @@ def soft_convert_objects(
return converted

if numeric and is_object_dtype(values.dtype):
converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# If all NaNs, then do not-alter
values = converted if not isna(converted).all() else values
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def to_numeric(arg, errors="raise", downcast=None):
values = ensure_object(values)
coerce_numeric = errors not in ("ignore", "raise")
try:
values = lib.maybe_convert_numeric(
values, _ = lib.maybe_convert_numeric(
values, set(), coerce_numeric=coerce_numeric
)
except (ValueError, TypeError):
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
if try_num_bool and is_object_dtype(values.dtype):
# exclude e.g DatetimeIndex here
try:
result = lib.maybe_convert_numeric(values, na_values, False)
result, _ = lib.maybe_convert_numeric(values, na_values, False)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be addressed in #40687, the main PR.

except (ValueError, TypeError):
# e.g. encountering datetime string gets ValueError
# TypeError can be raised in floatify
Expand All @@ -690,7 +690,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
na_count = parsers.sanitize_objects(values, na_values, False)

if result.dtype == np.object_ and try_num_bool:
result = libops.maybe_convert_bool(
result, _ = libops.maybe_convert_bool(
np.asarray(values),
true_values=self.true_values,
false_values=self.false_values,
Expand Down
Loading