pandas-dev · jreback · Dec 30, 2019 · Dec 2, 2019 · Dec 2, 2019 · Dec 2, 2019
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
@@ -15,14 +15,16 @@ Nullable integer data type
    IntegerArray is currently experimental. Its API or implementation may
    change without warning.
 
-
 In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent
 missing data. Because ``NaN`` is a float, this forces an array of integers with
 any missing values to become floating point. In some cases, this may not matter
 much. But if your integer column is, say, an identifier, casting to float can
 be problematic. Some integers cannot even be represented as floating point
 numbers.
 
+Construction
+------------
+
 Pandas can represent integer data with possibly missing values using
 :class:`arrays.IntegerArray`. This is an :ref:`extension types <extending.extension-types>`
 implemented within pandas.
@@ -39,6 +41,12 @@ NumPy's ``'int64'`` dtype:
 
    pd.array([1, 2, np.nan], dtype="Int64")
 
+All NA-like values are replaced with :attr:`pandas.NA`.
+
+.. ipython:: python
+
+   pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64")
+
 This array can be stored in a :class:`DataFrame` or :class:`Series` like any
 NumPy array.
 
@@ -78,6 +86,9 @@ with the dtype.
    In the future, we may provide an option for :class:`Series` to infer a
    nullable-integer dtype.
 
+Operations
+----------
+
 Operations involving an integer array will behave similar to NumPy arrays.
 Missing values will be propagated, and the data will be coerced to another
 dtype if needed.
@@ -123,3 +134,15 @@ Reduction and groupby operations such as 'sum' work as well.
 
    df.sum()
    df.groupby('B').A.sum()
+
+Scalar NA Value
+---------------
+
+:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar
+missing value. Selecting a single element that's missing will return
+:attr:`pandas.NA`.
+
+.. ipython:: python
+
+   a = pd.array([1, None], dtype="Int64")
+   a[1]
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -712,7 +712,6 @@ def all(self, skipna: bool = True, **kwargs):
     @classmethod
     def _create_logical_method(cls, op):
         def logical_method(self, other):
-
             if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
                 # Rely on pandas to unbox and dispatch to us.
                 return NotImplemented
@@ -760,8 +759,11 @@ def logical_method(self, other):
     @classmethod
     def _create_comparison_method(cls, op):
         def cmp_method(self, other):
+            from pandas.arrays import IntegerArray
 
-            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+            if isinstance(
+                other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray)
+            ):
                 # Rely on pandas to unbox and dispatch to us.
                 return NotImplemented
 

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -1,10 +1,10 @@
 import numbers
-from typing import Type
+from typing import Any, Tuple, Type
 import warnings
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import lib, missing as libmissing
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly
 
@@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype):
     name: str
     base = None
     type: Type
-    na_value = np.nan
+    na_value = libmissing.NA
 
     def __repr__(self) -> str:
         sign = "U" if self.is_unsigned_integer else ""
@@ -263,6 +263,11 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin):
 
     .. versionadded:: 0.24.0
 
+    .. versionchanged:: 1.0.0
+
+       Now uses :attr:`pandas.NA` as its missing value, rather
+       than :attr:`numpy.nan`.
+
     .. warning::
 
        IntegerArray is currently experimental, and its API or internal
@@ -358,29 +363,37 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)
 
-    def _formatter(self, boxed=False):
-        def fmt(x):
-            if isna(x):
-                return "NaN"
-            return str(x)
-
-        return fmt
-
     def __getitem__(self, item):
         if is_integer(item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
         return type(self)(self._data[item], self._mask[item])
 
-    def _coerce_to_ndarray(self):
+    def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):
         """
         coerce to an ndarary of object dtype
         """
+        if dtype is None:
+            dtype = object
+
+        if na_value is lib._no_default and is_float_dtype(dtype):
+            na_value = np.nan
+        elif na_value is lib._no_default:
+            na_value = libmissing.NA
+
+        if is_integer_dtype(dtype):
+            # Specifically, a NumPy integer dtype, not a pandas integer dtype,
+            # since we're coercing to a numpy dtype by definition in this function.
+            if not self.isna().any():
+                return self._data.astype(dtype)
+            else:
+                raise ValueError(
+                    "cannot convert to integer NumPy array with missing values"
+                )
 
-        # TODO(jreback) make this better
-        data = self._data.astype(object)
-        data[self._mask] = self._na_value
+        data = self._data.astype(dtype)
+        data[self._mask] = na_value
         return data
 
     __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
@@ -390,7 +403,7 @@ def __array__(self, dtype=None):
         the array interface, return my values
         We return an object array here to preserve our scalar values
         """
-        return self._coerce_to_ndarray()
+        return self._coerce_to_ndarray(dtype=dtype)
 
     def __arrow_array__(self, type=None):
         """
@@ -506,7 +519,7 @@ def isna(self):
 
     @property
     def _na_value(self):
-        return np.nan
+        return self.dtype.na_value
 
     @classmethod
     def _concat_same_type(cls, to_concat):
@@ -545,7 +558,7 @@ def astype(self, dtype, copy=True):
             return type(self)(result, mask=self._mask, copy=False)
 
         # coerce
-        data = self._coerce_to_ndarray()
+        data = self._coerce_to_ndarray(dtype=dtype)
         return astype_nansafe(data, dtype, copy=None)
 
     @property
@@ -600,12 +613,19 @@ def value_counts(self, dropna=True):
             # w/o passing the dtype
             array = np.append(array, [self._mask.sum()])
             index = Index(
-                np.concatenate([index.values, np.array([np.nan], dtype=object)]),
+                np.concatenate(
+                    [index.values, np.array([self.dtype.na_value], dtype=object)]
+                ),
                 dtype=object,
             )
 
         return Series(array, index=index)
 
+    def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
+        # TODO: https://github.com/pandas-dev/pandas/issues/30037
+        # use masked algorithms, rather than object-dtype / np.nan.
+        return self._coerce_to_ndarray(na_value=np.nan), np.nan
+
     def _values_for_argsort(self) -> np.ndarray:
         """Return values for sorting.
 
@@ -629,9 +649,11 @@ def _create_comparison_method(cls, op):
 
         @unpack_zerodim_and_defer(op.__name__)
         def cmp_method(self, other):
+            from pandas.arrays import BooleanArray
+
             mask = None
 
-            if isinstance(other, IntegerArray):
+            if isinstance(other, (BooleanArray, IntegerArray)):
                 other, mask = other._data, other._mask
 
             elif is_list_like(other):
@@ -643,25 +665,30 @@ def cmp_method(self, other):
                 if len(self) != len(other):
                     raise ValueError("Lengths must match to compare")
 
-            # numpy will show a DeprecationWarning on invalid elementwise
-            # comparisons, this will raise in the future
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
-                with np.errstate(all="ignore"):
-                    method = getattr(self._data, f"__{op_name}__")
-                    result = method(other)
+            if other is libmissing.NA:
+                # numpy does not handle pd.NA well as "other" scalar (it returns
+                # a scalar False instead of an array).
+                # This may be fixed by NA.__array_ufunc__. Revisit this check
+                # once that's implemented.
+                result = np.zeros(self._data.shape, dtype="bool")
+                mask = np.ones(self._data.shape, dtype="bool")
+            else:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+                    with np.errstate(all="ignore"):
+                        method = getattr(self._data, f"__{op_name}__")
+                        result = method(other)
 
                     if result is NotImplemented:
                         result = invalid_comparison(self._data, other, op)
 
             # nans propagate
             if mask is None:
-                mask = self._mask
+                mask = self._mask.copy()
             else:
                 mask = self._mask | mask
 
-            result[mask] = op_name == "ne"
-            return result
+            return BooleanArray(result, mask)
 
         name = f"__{op.__name__}__"
         return set_function_name(cmp_method, name, cls)
@@ -673,7 +700,8 @@ def _reduce(self, name, skipna=True, **kwargs):
         # coerce to a nan-aware float if needed
         if mask.any():
             data = self._data.astype("float64")
-            data[mask] = self._na_value
+            # We explicitly use NaN within reductions.
+            data[mask] = np.nan
 
         op = getattr(nanops, "nan" + name)
         result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
@@ -739,12 +767,13 @@ def integer_arithmetic_method(self, other):
                     raise TypeError("can only perform ops with numeric values")
 
             else:
-                if not (is_float(other) or is_integer(other)):
+                if not (is_float(other) or is_integer(other) or other is libmissing.NA):
                     raise TypeError("can only perform ops with numeric values")
 
-            # nans propagate
             if omask is None:
                 mask = self._mask.copy()
+                if other is libmissing.NA:
+                    mask |= True
             else:
                 mask = self._mask | omask
 
@@ -754,20 +783,23 @@ def integer_arithmetic_method(self, other):
                 # x ** 0 is 1.
                 if omask is not None:
                     mask = np.where((other == 0) & ~omask, False, mask)
-                else:
+                elif other is not libmissing.NA:
                     mask = np.where(other == 0, False, mask)
 
             elif op_name == "rpow":
                 # 1 ** x is 1.
                 if omask is not None:
                     mask = np.where((other == 1) & ~omask, False, mask)
-                else:
+                elif other is not libmissing.NA:
                     mask = np.where(other == 1, False, mask)
                 # x ** 0 is 1.
                 mask = np.where((self._data == 0) & ~self._mask, False, mask)
 
-            with np.errstate(all="ignore"):
-                result = op(self._data, other)
+            if other is libmissing.NA:
+                result = np.ones_like(self._data)
+            else:
+                with np.errstate(all="ignore"):
+                    result = op(self._data, other)
 
             # divmod returns a tuple
             if op_name == "divmod":
@@ -790,6 +822,11 @@ def integer_arithmetic_method(self, other):
 _dtype_docstring = """
 An ExtensionDtype for {dtype} integer data.
 
+.. versionchanged:: 1.0.0
+
+   Now uses :attr:`pandas.NA` as its missing value,
+   rather than :attr:`numpy.nan`.
+
 Attributes
 ----------
 None