Skip to content

Commit 4d73a34

Browse files
authored
ENH: Nullable integer/boolean/floating support in lib inferencing functions (#40914)
1 parent e0cc505 commit 4d73a34

File tree

8 files changed

+270
-64
lines changed

8 files changed

+270
-64
lines changed

pandas/_libs/lib.pyi

+13-1
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,24 @@ def maybe_convert_objects(
114114
convert_to_nullable_integer: bool = ...,
115115
) -> ArrayLike: ...
116116

117+
@overload
117118
def maybe_convert_numeric(
118119
values: np.ndarray, # np.ndarray[object]
119120
na_values: set,
120121
convert_empty: bool = True,
121122
coerce_numeric: bool = False,
122-
) -> np.ndarray: ...
123+
convert_to_masked_nullable: Literal[False] = ...,
124+
) -> tuple[np.ndarray, None]: ...
125+
126+
@overload
127+
def maybe_convert_numeric(
128+
values: np.ndarray, # np.ndarray[object]
129+
na_values: set,
130+
convert_empty: bool = True,
131+
coerce_numeric: bool = False,
132+
*,
133+
convert_to_masked_nullable: Literal[True],
134+
) -> tuple[np.ndarray, np.ndarray]: ...
123135

124136
# TODO: restrict `arr`?
125137
def ensure_string_array(

pandas/_libs/lib.pyx

+66-17
Original file line numberDiff line numberDiff line change
@@ -2029,7 +2029,8 @@ def maybe_convert_numeric(
20292029
set na_values,
20302030
bint convert_empty=True,
20312031
bint coerce_numeric=False,
2032-
) -> ndarray:
2032+
bint convert_to_masked_nullable=False,
2033+
) -> tuple[np.ndarray, np.ndarray | None]:
20332034
"""
20342035
Convert object array to a numeric array if possible.
20352036

@@ -2053,14 +2054,20 @@ def maybe_convert_numeric(
20532054
numeric array has no suitable numerical dtype to return (i.e. uint64,
20542055
int32, uint8). If set to False, the original object array will be
20552056
returned. Otherwise, a ValueError will be raised.
2056-
2057+
convert_to_masked_nullable : bool, default False
2058+
Whether to return a mask for the converted values. This also disables
2059+
upcasting for ints with nulls to float64.
20572060
Returns
20582061
-------
20592062
np.ndarray
20602063
Array of converted object values to numerical ones.
2064+
2065+
Optional[np.ndarray]
2066+
If convert_to_masked_nullable is True,
2067+
returns a boolean mask for float or integer results that contain nulls, otherwise returns None.
20612068
"""
20622069
if len(values) == 0:
2063-
return np.array([], dtype='i8')
2070+
return (np.array([], dtype='i8'), None)
20642071

20652072
# fastpath for ints - try to convert all based on first value
20662073
cdef:
@@ -2070,7 +2077,7 @@ def maybe_convert_numeric(
20702077
try:
20712078
maybe_ints = values.astype('i8')
20722079
if (maybe_ints == values).all():
2073-
return maybe_ints
2080+
return (maybe_ints, None)
20742081
except (ValueError, OverflowError, TypeError):
20752082
pass
20762083

@@ -2084,21 +2091,40 @@ def maybe_convert_numeric(
20842091
ndarray[int64_t] ints = np.empty(n, dtype='i8')
20852092
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
20862093
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
2094+
ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
20872095
float64_t fval
2096+
bint allow_null_in_int = convert_to_masked_nullable
20882097

20892098
for i in range(n):
20902099
val = values[i]
2100+
# We only want to disable NaNs showing as float if
2101+
# a) convert_to_masked_nullable = True
2102+
# b) no floats have been seen (assuming an int shows up later)
2103+
# However, if no ints present (all null array), we need to return floats
2104+
allow_null_in_int = convert_to_masked_nullable and not seen.float_
20912105

20922106
if val.__hash__ is not None and val in na_values:
2093-
seen.saw_null()
2107+
if allow_null_in_int:
2108+
seen.null_ = True
2109+
mask[i] = 1
2110+
else:
2111+
if convert_to_masked_nullable:
2112+
mask[i] = 1
2113+
seen.saw_null()
20942114
floats[i] = complexes[i] = NaN
20952115
elif util.is_float_object(val):
20962116
fval = val
20972117
if fval != fval:
20982118
seen.null_ = True
2099-
2119+
if allow_null_in_int:
2120+
mask[i] = 1
2121+
else:
2122+
if convert_to_masked_nullable:
2123+
mask[i] = 1
2124+
seen.float_ = True
2125+
else:
2126+
seen.float_ = True
21002127
floats[i] = complexes[i] = fval
2101-
seen.float_ = True
21022128
elif util.is_integer_object(val):
21032129
floats[i] = complexes[i] = val
21042130

@@ -2121,7 +2147,13 @@ def maybe_convert_numeric(
21212147
floats[i] = uints[i] = ints[i] = bools[i] = val
21222148
seen.bool_ = True
21232149
elif val is None or val is C_NA:
2124-
seen.saw_null()
2150+
if allow_null_in_int:
2151+
seen.null_ = True
2152+
mask[i] = 1
2153+
else:
2154+
if convert_to_masked_nullable:
2155+
mask[i] = 1
2156+
seen.saw_null()
21252157
floats[i] = complexes[i] = NaN
21262158
elif hasattr(val, '__len__') and len(val) == 0:
21272159
if convert_empty or seen.coerce_numeric:
@@ -2142,17 +2174,22 @@ def maybe_convert_numeric(
21422174
if fval in na_values:
21432175
seen.saw_null()
21442176
floats[i] = complexes[i] = NaN
2177+
mask[i] = 1
21452178
else:
21462179
if fval != fval:
21472180
seen.null_ = True
2181+
mask[i] = 1
21482182

21492183
floats[i] = fval
21502184

21512185
if maybe_int:
21522186
as_int = int(val)
21532187

21542188
if as_int in na_values:
2155-
seen.saw_null()
2189+
mask[i] = 1
2190+
seen.null_ = True
2191+
if not allow_null_in_int:
2192+
seen.float_ = True
21562193
else:
21572194
seen.saw_int(as_int)
21582195

@@ -2180,22 +2217,34 @@ def maybe_convert_numeric(
21802217
floats[i] = NaN
21812218

21822219
if seen.check_uint64_conflict():
2183-
return values
2220+
return (values, None)
2221+
2222+
# This occurs since we disabled nulls showing as float in anticipation
2223+
# of seeing ints that were never seen. So then, we return float
2224+
if allow_null_in_int and seen.null_ and not seen.int_:
2225+
seen.float_ = True
21842226

21852227
if seen.complex_:
2186-
return complexes
2228+
return (complexes, None)
21872229
elif seen.float_:
2188-
return floats
2230+
if seen.null_ and convert_to_masked_nullable:
2231+
return (floats, mask.view(np.bool_))
2232+
return (floats, None)
21892233
elif seen.int_:
2234+
if seen.null_ and convert_to_masked_nullable:
2235+
if seen.uint_:
2236+
return (uints, mask.view(np.bool_))
2237+
else:
2238+
return (ints, mask.view(np.bool_))
21902239
if seen.uint_:
2191-
return uints
2240+
return (uints, None)
21922241
else:
2193-
return ints
2242+
return (ints, None)
21942243
elif seen.bool_:
2195-
return bools.view(np.bool_)
2244+
return (bools.view(np.bool_), None)
21962245
elif seen.uint_:
2197-
return uints
2198-
return ints
2246+
return (uints, None)
2247+
return (ints, None)
21992248

22002249

22012250
@cython.boundscheck(False)

pandas/_libs/ops.pyi

+14-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from typing import (
22
Any,
33
Callable,
4+
Literal,
5+
overload,
46
)
57

68
import numpy as np
@@ -35,9 +37,19 @@ def vec_binop(
3537
op: _BinOp, # binary operator
3638
) -> np.ndarray: ...
3739

40+
@overload
41+
def maybe_convert_bool(
42+
arr: np.ndarray, # np.ndarray[object]
43+
true_values=...,
44+
false_values=...,
45+
convert_to_masked_nullable: Literal[False] = ...,
46+
) -> tuple[np.ndarray, None]: ...
3847

48+
@overload
3949
def maybe_convert_bool(
4050
arr: np.ndarray, # np.ndarray[object]
4151
true_values=...,
42-
false_values=...
43-
) -> np.ndarray: ...
52+
false_values=...,
53+
*,
54+
convert_to_masked_nullable: Literal[True],
55+
) -> tuple[np.ndarray, np.ndarray]: ...

pandas/_libs/ops.pyx

+23-19
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ import_array()
2424

2525

2626
from pandas._libs.missing cimport checknull
27-
from pandas._libs.util cimport (
28-
UINT8_MAX,
29-
is_nan,
30-
)
27+
from pandas._libs.util cimport is_nan
3128

3229

3330
@cython.wraparound(False)
@@ -212,7 +209,7 @@ def scalar_binop(object[:] values, object val, object op) -> ndarray:
212209
else:
213210
result[i] = op(x, val)
214211

215-
return maybe_convert_bool(result.base)
212+
return maybe_convert_bool(result.base)[0]
216213

217214

218215
@cython.wraparound(False)
@@ -254,21 +251,25 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
254251
else:
255252
raise
256253

257-
return maybe_convert_bool(result.base) # `.base` to access np.ndarray
254+
return maybe_convert_bool(result.base)[0] # `.base` to access np.ndarray
258255

259256

260257
def maybe_convert_bool(ndarray[object] arr,
261-
true_values=None, false_values=None) -> ndarray:
258+
true_values=None,
259+
false_values=None,
260+
convert_to_masked_nullable=False
261+
) -> tuple[np.ndarray, np.ndarray | None]:
262262
cdef:
263263
Py_ssize_t i, n
264264
ndarray[uint8_t] result
265+
ndarray[uint8_t] mask
265266
object val
266267
set true_vals, false_vals
267-
int na_count = 0
268+
bint has_na = False
268269

269270
n = len(arr)
270271
result = np.empty(n, dtype=np.uint8)
271-
272+
mask = np.zeros(n, dtype=np.uint8)
272273
# the defaults
273274
true_vals = {'True', 'TRUE', 'true'}
274275
false_vals = {'False', 'FALSE', 'false'}
@@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
291292
result[i] = 1
292293
elif val in false_vals:
293294
result[i] = 0
294-
elif isinstance(val, float):
295-
result[i] = UINT8_MAX
296-
na_count += 1
295+
elif is_nan(val):
296+
mask[i] = 1
297+
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
298+
has_na = True
297299
else:
298-
return arr
300+
return (arr, None)
299301

300-
if na_count > 0:
301-
mask = result == UINT8_MAX
302-
arr = result.view(np.bool_).astype(object)
303-
np.putmask(arr, mask, np.nan)
304-
return arr
302+
if has_na:
303+
if convert_to_masked_nullable:
304+
return (result.view(np.bool_), mask.view(np.bool_))
305+
else:
306+
arr = result.view(np.bool_).astype(object)
307+
np.putmask(arr, mask, np.nan)
308+
return (arr, None)
305309
else:
306-
return result.view(np.bool_)
310+
return (result.view(np.bool_), None)

pandas/core/dtypes/cast.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1356,7 +1356,7 @@ def soft_convert_objects(
13561356
return converted
13571357

13581358
if numeric and is_object_dtype(values.dtype):
1359-
converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
1359+
converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
13601360

13611361
# If all NaNs, then do not-alter
13621362
values = converted if not isna(converted).all() else values

pandas/core/tools/numeric.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def to_numeric(arg, errors="raise", downcast=None):
180180
values = ensure_object(values)
181181
coerce_numeric = errors not in ("ignore", "raise")
182182
try:
183-
values = lib.maybe_convert_numeric(
183+
values, _ = lib.maybe_convert_numeric(
184184
values, set(), coerce_numeric=coerce_numeric
185185
)
186186
except (ValueError, TypeError):

pandas/io/parsers/base_parser.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
676676
if try_num_bool and is_object_dtype(values.dtype):
677677
# exclude e.g DatetimeIndex here
678678
try:
679-
result = lib.maybe_convert_numeric(values, na_values, False)
679+
result, _ = lib.maybe_convert_numeric(values, na_values, False)
680680
except (ValueError, TypeError):
681681
# e.g. encountering datetime string gets ValueError
682682
# TypeError can be raised in floatify
@@ -690,7 +690,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
690690
na_count = parsers.sanitize_objects(values, na_values, False)
691691

692692
if result.dtype == np.object_ and try_num_bool:
693-
result = libops.maybe_convert_bool(
693+
result, _ = libops.maybe_convert_bool(
694694
np.asarray(values),
695695
true_values=self.true_values,
696696
false_values=self.false_values,

0 commit comments

Comments
 (0)