ENH: Nullable integer/boolean/floating support in lib inferencing functions (#40914)

lithomas1 · web-flow · commit 4d73a34cdb41 · 2021-05-05T08:46:38.000-04:00
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -114,12 +114,24 @@ def maybe_convert_objects(
     convert_to_nullable_integer: bool = ...,
 ) -> ArrayLike: ...
 
+@overload
 def maybe_convert_numeric(
     values: np.ndarray,  # np.ndarray[object]
     na_values: set,
     convert_empty: bool = True,
     coerce_numeric: bool = False,
-) -> np.ndarray: ...
+    convert_to_masked_nullable: Literal[False] = ...,
+) -> tuple[np.ndarray, None]: ...
+
+@overload
+def maybe_convert_numeric(
+    values: np.ndarray,  # np.ndarray[object]
+    na_values: set,
+    convert_empty: bool = True,
+    coerce_numeric: bool = False,
+    *,
+    convert_to_masked_nullable: Literal[True],
+) -> tuple[np.ndarray, np.ndarray]: ...
 
 # TODO: restrict `arr`?
 def ensure_string_array(
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2029,7 +2029,8 @@ def maybe_convert_numeric(
     set na_values,
     bint convert_empty=True,
     bint coerce_numeric=False,
-) -> ndarray:
+    bint convert_to_masked_nullable=False,
+) -> tuple[np.ndarray, np.ndarray | None]:
     """
     Convert object array to a numeric array if possible.
 
@@ -2053,14 +2054,20 @@ def maybe_convert_numeric(
         numeric array has no suitable numerical dtype to return (i.e. uint64,
         int32, uint8). If set to False, the original object array will be
         returned. Otherwise, a ValueError will be raised.
-
+    convert_to_masked_nullable : bool, default False
+        Whether to return a mask for the converted values. This also disables
+        upcasting for ints with nulls to float64.
     Returns
     -------
     np.ndarray
         Array of converted object values to numerical ones.
+
+    Optional[np.ndarray]
+        Boolean mask for a float/int result if convert_to_masked_nullable is
+        True and nulls were seen, otherwise None.
     """
     if len(values) == 0:
-        return np.array([], dtype='i8')
+        return (np.array([], dtype='i8'), None)
 
     # fastpath for ints - try to convert all based on first value
     cdef:
@@ -2070,7 +2077,7 @@ def maybe_convert_numeric(
         try:
             maybe_ints = values.astype('i8')
             if (maybe_ints == values).all():
-                return maybe_ints
+                return (maybe_ints, None)
         except (ValueError, OverflowError, TypeError):
             pass
 
@@ -2084,21 +2091,40 @@ def maybe_convert_numeric(
         ndarray[int64_t] ints = np.empty(n, dtype='i8')
         ndarray[uint64_t] uints = np.empty(n, dtype='u8')
         ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+        ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
         float64_t fval
+        bint allow_null_in_int = convert_to_masked_nullable
 
     for i in range(n):
         val = values[i]
+        # A null is recorded without marking the result as float only if
+        # a) convert_to_masked_nullable = True, and
+        # b) no floats have been seen (assuming an int shows up later).
+        # If no ints ever show up (all-null array), floats are returned instead.
+        allow_null_in_int = convert_to_masked_nullable and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
-            seen.saw_null()
+            if allow_null_in_int:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                if convert_to_masked_nullable:
+                    mask[i] = 1
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
                 seen.null_ = True
-
+                if allow_null_in_int:
+                    mask[i] = 1
+                else:
+                    if convert_to_masked_nullable:
+                        mask[i] = 1
+                    seen.float_ = True
+            else:
+                seen.float_ = True
             floats[i] = complexes[i] = fval
-            seen.float_ = True
         elif util.is_integer_object(val):
             floats[i] = complexes[i] = val
 
@@ -2121,7 +2147,13 @@ def maybe_convert_numeric(
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
         elif val is None or val is C_NA:
-            seen.saw_null()
+            if allow_null_in_int:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                if convert_to_masked_nullable:
+                    mask[i] = 1
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif hasattr(val, '__len__') and len(val) == 0:
             if convert_empty or seen.coerce_numeric:
@@ -2142,17 +2174,22 @@ def maybe_convert_numeric(
                 if fval in na_values:
                     seen.saw_null()
                     floats[i] = complexes[i] = NaN
+                    mask[i] = 1
                 else:
                     if fval != fval:
                         seen.null_ = True
+                        mask[i] = 1
 
                     floats[i] = fval
 
                 if maybe_int:
                     as_int = int(val)
 
                     if as_int in na_values:
-                        seen.saw_null()
+                        mask[i] = 1
+                        seen.null_ = True
+                        if not allow_null_in_int:
+                            seen.float_ = True
                     else:
                         seen.saw_int(as_int)
 
@@ -2180,22 +2217,34 @@ def maybe_convert_numeric(
                 floats[i] = NaN
 
     if seen.check_uint64_conflict():
-        return values
+        return (values, None)
+
+    # Nulls were recorded without marking the result as float in anticipation
+    # of ints that never appeared, so fall back to returning floats.
+    if allow_null_in_int and seen.null_ and not seen.int_:
+        seen.float_ = True
 
     if seen.complex_:
-        return complexes
+        return (complexes, None)
     elif seen.float_:
-        return floats
+        if seen.null_ and convert_to_masked_nullable:
+            return (floats, mask.view(np.bool_))
+        return (floats, None)
     elif seen.int_:
+        if seen.null_ and convert_to_masked_nullable:
+            if seen.uint_:
+                return (uints, mask.view(np.bool_))
+            else:
+                return (ints, mask.view(np.bool_))
         if seen.uint_:
-            return uints
+            return (uints, None)
         else:
-            return ints
+            return (ints, None)
     elif seen.bool_:
-        return bools.view(np.bool_)
+        return (bools.view(np.bool_), None)
     elif seen.uint_:
-        return uints
-    return ints
+        return (uints, None)
+    return (ints, None)
 
 
 @cython.boundscheck(False)
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
@@ -1,6 +1,8 @@
 from typing import (
     Any,
     Callable,
+    Literal,
+    overload,
 )
 
 import numpy as np
@@ -35,9 +37,19 @@ def vec_binop(
     op: _BinOp,         # binary operator
 ) -> np.ndarray: ...
 
+@overload
+def maybe_convert_bool(
+    arr: np.ndarray,  # np.ndarray[object]
+    true_values=...,
+    false_values=...,
+    convert_to_masked_nullable: Literal[False] = ...,
+) -> tuple[np.ndarray, None]: ...
 
+@overload
 def maybe_convert_bool(
     arr: np.ndarray,  # np.ndarray[object]
     true_values=...,
-    false_values=...
-) -> np.ndarray: ...
+    false_values=...,
+    *,
+    convert_to_masked_nullable: Literal[True],
+) -> tuple[np.ndarray, np.ndarray]: ...
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
@@ -24,10 +24,7 @@ import_array()
 
 
 from pandas._libs.missing cimport checknull
-from pandas._libs.util cimport (
-    UINT8_MAX,
-    is_nan,
-)
+from pandas._libs.util cimport is_nan
 
 
 @cython.wraparound(False)
@@ -212,7 +209,7 @@ def scalar_binop(object[:] values, object val, object op) -> ndarray:
         else:
             result[i] = op(x, val)
 
-    return maybe_convert_bool(result.base)
+    return maybe_convert_bool(result.base)[0]
 
 
 @cython.wraparound(False)
@@ -254,21 +251,25 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
             else:
                 raise
 
-    return maybe_convert_bool(result.base)  # `.base` to access np.ndarray
+    return maybe_convert_bool(result.base)[0]  # `.base` to access np.ndarray
 
 
 def maybe_convert_bool(ndarray[object] arr,
-                       true_values=None, false_values=None) -> ndarray:
+                       true_values=None,
+                       false_values=None,
+                       convert_to_masked_nullable=False
+                       ) -> tuple[np.ndarray, np.ndarray | None]:
     cdef:
         Py_ssize_t i, n
         ndarray[uint8_t] result
+        ndarray[uint8_t] mask
         object val
         set true_vals, false_vals
-        int na_count = 0
+        bint has_na = False
 
     n = len(arr)
     result = np.empty(n, dtype=np.uint8)
-
+    mask = np.zeros(n, dtype=np.uint8)
     # the defaults
     true_vals = {'True', 'TRUE', 'true'}
     false_vals = {'False', 'FALSE', 'false'}
@@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
             result[i] = 1
         elif val in false_vals:
             result[i] = 0
-        elif isinstance(val, float):
-            result[i] = UINT8_MAX
-            na_count += 1
+        elif is_nan(val):
+            mask[i] = 1
+            result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
+            has_na = True
         else:
-            return arr
+            return (arr, None)
 
-    if na_count > 0:
-        mask = result == UINT8_MAX
-        arr = result.view(np.bool_).astype(object)
-        np.putmask(arr, mask, np.nan)
-        return arr
+    if has_na:
+        if convert_to_masked_nullable:
+            return (result.view(np.bool_), mask.view(np.bool_))
+        else:
+            arr = result.view(np.bool_).astype(object)
+            np.putmask(arr, mask, np.nan)
+            return (arr, None)
     else:
-        return result.view(np.bool_)
+        return (result.view(np.bool_), None)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1356,7 +1356,7 @@ def soft_convert_objects(
             return converted
 
     if numeric and is_object_dtype(values.dtype):
-        converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
+        converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
 
         # If all NaNs, then do not-alter
         values = converted if not isna(converted).all() else values
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -180,7 +180,7 @@ def to_numeric(arg, errors="raise", downcast=None):
         values = ensure_object(values)
         coerce_numeric = errors not in ("ignore", "raise")
         try:
-            values = lib.maybe_convert_numeric(
+            values, _ = lib.maybe_convert_numeric(
                 values, set(), coerce_numeric=coerce_numeric
             )
         except (ValueError, TypeError):
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -676,7 +676,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         if try_num_bool and is_object_dtype(values.dtype):
             # exclude e.g DatetimeIndex here
             try:
-                result = lib.maybe_convert_numeric(values, na_values, False)
+                result, _ = lib.maybe_convert_numeric(values, na_values, False)
             except (ValueError, TypeError):
                 # e.g. encountering datetime string gets ValueError
                 #  TypeError can be raised in floatify
@@ -690,7 +690,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
                 na_count = parsers.sanitize_objects(values, na_values, False)
 
         if result.dtype == np.object_ and try_num_bool:
-            result = libops.maybe_convert_bool(
+            result, _ = libops.maybe_convert_bool(
                 np.asarray(values),
                 true_values=self.true_values,
                 false_values=self.false_values,
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py

Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,7 @@ def to_numeric(arg, errors="raise", downcast=None):`
`180`	`180`	`values = ensure_object(values)`
`181`	`181`	`coerce_numeric = errors not in ("ignore", "raise")`
`182`	`182`	`try:`
`183`		`- values = lib.maybe_convert_numeric(`
	`183`	`+ values, _ = lib.maybe_convert_numeric(`
`184`	`184`	`values, set(), coerce_numeric=coerce_numeric`
`185`	`185`	`)`
`186`	`186`	`except (ValueError, TypeError):`