From 246b78768a0ef9d514d918dde51f094d4bddaa5d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 27 Mar 2020 10:26:28 +0100
Subject: [PATCH 01/10] ENH/PERF: use mask in factorize

---
 pandas/_libs/hashtable_class_helper.pxi.in | 35 +++++++++++++++++-----
 pandas/core/algorithms.py                  | 10 +++++--
 pandas/core/arrays/integer.py              | 12 +++++++-
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 3ce3bc519b311..866b16178be48 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable):
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                 object na_value=None, bint ignore_na=False,
-                bint return_inverse=False):
+                object mask=None, bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
+        mask : ndarray[bool], optional
+            If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid) instead of `na_value` or
+            condition "val != val".
         return_inverse : boolean, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
@@ -409,12 +413,17 @@ cdef class {{name}}HashTable(HashTable):
             {{dtype}}_t val, na_value2
             khiter_t k
             {{name}}VectorData *ud
-            bint use_na_value
+            bint use_na_value, use_mask
+            uint8_t[:] mask_values
 
         if return_inverse:
             labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
+        use_mask = mask is not None
+
+        if use_mask:
+            mask_values = mask.view("uint8")
 
         if use_na_value:
             # We need this na_value2 because we want to allow users
@@ -430,7 +439,11 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if ignore_na and (
+                if ignore_na and use_mask:
+                    if mask_values[i]:
+                        labels[i] = na_sentinel
+                        continue
+                elif ignore_na and (
                 {{if not name.lower().startswith(("uint", "int"))}}
                 val != val or
                 {{endif}}
@@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -512,6 +525,10 @@ cdef class {{name}}HashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid) instead of `na_value` or
+            condition "val != val".
 
         Returns
         -------
@@ -522,7 +539,7 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques_vector = {{name}}Vector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=True, mask=mask,
                             return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -873,6 +890,8 @@ cdef class StringHashTable(HashTable):
             that is not a string is considered missing. If na_value is
             not None, then _additionally_ any value "val" satisfying
             val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            Not yet implemented for StringHashTable.
 
         Returns
         -------
@@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1112,6 +1131,8 @@ cdef class PyObjectHashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            Not yet implemented for PyObjectHashTable.
 
         Returns
         -------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9afdb82467f90..444b3610ef0a6 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -455,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
 
 def _factorize_array(
-    values, na_sentinel: int = -1, size_hint=None, na_value=None
+    values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Factorize an array-like to codes and uniques.
@@ -473,6 +473,10 @@ def _factorize_array(
         parameter when you know that you don't have any values pandas would
         consider missing in the array (NaN for float data, iNaT for
         datetimes, etc.).
+    mask : ndarray[bool], optional
+        If not None, the mask is used as indicator for missing values
+        (True = missing, False = valid) instead of `na_value` or
+        condition "val != val".
 
     Returns
     -------
@@ -482,7 +486,9 @@ def _factorize_array(
     hash_klass, values = _get_data_algo(values)
 
     table = hash_klass(size_hint or len(values))
-    uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
+    uniques, codes = table.factorize(
+        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
+    )
 
     codes = ensure_platform_int(codes)
     return codes, uniques
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index f2880c5cbee42..e03b89dfda2b1 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
+from pandas.core.algorithms import _factorize_array
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
@@ -481,7 +482,16 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
     def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
         # TODO: https://github.com/pandas-dev/pandas/issues/30037
         # use masked algorithms, rather than object-dtype / np.nan.
-        return self.to_numpy(na_value=np.nan), np.nan
+        return self.to_numpy(dtype=float, na_value=np.nan), np.nan
+
+    def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]:
+        arr = self._data
+        mask = self._mask
+
+        codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
+
+        uniques = IntegerArray(uniques, np.zeros(len(uniques), dtype=bool))
+        return codes, uniques
 
     def _values_for_argsort(self) -> np.ndarray:
         """

From af1cdeaab0b5b2f86f21b060ee348701037affbf Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 27 Mar 2020 12:57:19 +0100
Subject: [PATCH 02/10] fix typing

---
 pandas/core/arrays/integer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index e03b89dfda2b1..24f5760193432 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -484,7 +484,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
         # use masked algorithms, rather than object-dtype / np.nan.
         return self.to_numpy(dtype=float, na_value=np.nan), np.nan
 
-    def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]:
+    def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "IntegerArray"]:
         arr = self._data
         mask = self._mask
 

From be5a21dc9a567ee4e2ec6e6be4690c21dae4da04 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 2 Apr 2020 16:27:23 +0200
Subject: [PATCH 03/10] override in factorize, moved to base masked

---
 pandas/core/arrays/boolean.py          | 13 ++++---------
 pandas/core/arrays/integer.py          | 19 -------------------
 pandas/core/arrays/masked.py           | 17 +++++++++++++++--
 pandas/tests/extension/base/methods.py |  2 ++
 4 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 442d4ca8cef6d..370b00d3ca562 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -83,6 +83,10 @@ def type(self) -> Type[np.bool_]:
     def kind(self) -> str:
         return "b"
 
+    @property
+    def numpy_dtype(self) -> np.dtype:
+        return np.dtype("bool")
+
     @classmethod
     def construct_array_type(cls) -> Type["BooleanArray"]:
         """
@@ -314,15 +318,6 @@ def map_string(s):
         scalars = [map_string(x) for x in strings]
         return cls._from_sequence(scalars, dtype, copy)
 
-    def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
-        data = self._data.astype("int8")
-        data[self._mask] = -1
-        return data, -1
-
-    @classmethod
-    def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
-        return cls._from_sequence(values, dtype=original.dtype)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
 
     def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 9bd80cca747da..3a84ce59bd0e0 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -27,7 +27,6 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
-from pandas.core.algorithms import _factorize_array
 from pandas.core.array_algos import masked_reductions
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
@@ -367,10 +366,6 @@ def _from_sequence_of_strings(
         scalars = to_numeric(strings, errors="raise")
         return cls._from_sequence(scalars, dtype, copy)
 
-    @classmethod
-    def _from_factorized(cls, values, original) -> "IntegerArray":
-        return integer_array(values, dtype=original.dtype)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number)
 
     def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
@@ -480,20 +475,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
         data = self.to_numpy(dtype=dtype, **kwargs)
         return astype_nansafe(data, dtype, copy=False)
 
-    def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
-        # TODO: https://github.com/pandas-dev/pandas/issues/30037
-        # use masked algorithms, rather than object-dtype / np.nan.
-        return self.to_numpy(dtype=float, na_value=np.nan), np.nan
-
-    def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "IntegerArray"]:
-        arr = self._data
-        mask = self._mask
-
-        codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
-
-        uniques = IntegerArray(uniques, np.zeros(len(uniques), dtype=bool))
-        return codes, uniques
-
     def _values_for_argsort(self) -> np.ndarray:
         """
         Return values for sorting.
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index d23d26d870f75..d2abe7eb340b2 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1,14 +1,15 @@
-from typing import TYPE_CHECKING, Optional, Type, TypeVar
+from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
 
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
 from pandas._typing import Scalar
+from pandas.util._decorators import doc
 
 from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
 from pandas.core.dtypes.missing import isna, notna
 
-from pandas.core.algorithms import take
+from pandas.core.algorithms import _factorize_array, take
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 from pandas.core.indexers import check_array_indexer
 
@@ -217,6 +218,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
         mask = mask.copy()
         return type(self)(data, mask, copy=False)
 
+    @doc(ExtensionArray.factorize)
+    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
+        arr = self._data
+        mask = self._mask
+
+        codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
+
+        # the hashtables don't handle all different types of bits
+        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
+        uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
+        return codes, uniques
+
     def value_counts(self, dropna: bool = True) -> "Series":
         """
         Returns a Series containing counts of each unique value.
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 22e53dbc89f01..c3d25e8cb26c2 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -133,6 +133,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
 
         tm.assert_numpy_array_equal(codes_1, codes_2)
         self.assert_extension_array_equal(uniques_1, uniques_2)
+        assert len(uniques_1) == len(pd.unique(uniques_1))
+        assert uniques_1.dtype == data_for_grouping.dtype
 
     def test_factorize_empty(self, data):
         codes, uniques = pd.factorize(data[:0])

From b0a88b9e2edc81c8764f933d2bc80caf0dfe5ab6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 2 Apr 2020 16:40:25 +0200
Subject: [PATCH 04/10] add Int64/boolean to factorize benchmarks

---
 asv_bench/benchmarks/algorithms.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 1768e682b3db4..48c75262b8df4 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -34,7 +34,16 @@ class Factorize:
     params = [
         [True, False],
         [True, False],
-        ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
+        [
+            "int",
+            "uint",
+            "float",
+            "string",
+            "datetime64[ns]",
+            "datetime64[ns, tz]",
+            "Int64",
+            "boolean",
+        ],
     ]
     param_names = ["unique", "sort", "dtype"]
 
@@ -49,13 +58,21 @@ def setup(self, unique, sort, dtype):
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
+            "Int64": pd.array(np.arange(N), dtype="Int64"),
+            "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
         }[dtype]
         if not unique:
             data = data.repeat(5)
         self.idx = data
+        if dtype in ("Int64", "boolean") and sort:
+            # sort is not a keyword on EAs
+            raise NotImplementedError
 
     def time_factorize(self, unique, sort, dtype):
-        self.idx.factorize(sort=sort)
+        if sort:
+            self.idx.factorize(sort=sort)
+        else:
+            self.idx.factorize()
 
 
 class Duplicated:

From 2e948428d2240f10661cd66950a2bab8c9d9b8d3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 2 Apr 2020 16:43:37 +0200
Subject: [PATCH 05/10] add whatsnew

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 25f847c698278..141edd171875d 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -274,6 +274,7 @@ Performance improvements
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
 - Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
+- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
 
 
 .. ---------------------------------------------------------------------------

From c97d35712fc4fdd3dcce6082837cd3aa220b7fcb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 4 Apr 2020 09:15:49 +0200
Subject: [PATCH 06/10] add # type: ignore

---
 pandas/core/arrays/masked.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index d2abe7eb340b2..52e31144a1468 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -226,7 +226,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
         codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
 
         # the hashtables don't handle all different types of bits
-        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
+        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)  # type: ignore
         uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
         return codes, uniques
 

From a6bc6fc1aedf388aa9977473bdbe937bcee3c60b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 7 Apr 2020 08:56:21 +0200
Subject: [PATCH 07/10] update benchmark

---
 asv_bench/benchmarks/algorithms.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 48c75262b8df4..65e52e03c43c7 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -63,16 +63,10 @@ def setup(self, unique, sort, dtype):
         }[dtype]
         if not unique:
             data = data.repeat(5)
-        self.idx = data
-        if dtype in ("Int64", "boolean") and sort:
-            # sort is not a keyword on EAs
-            raise NotImplementedError
+        self.data = data
 
     def time_factorize(self, unique, sort, dtype):
-        if sort:
-            self.idx.factorize(sort=sort)
-        else:
-            self.idx.factorize()
+        pd.factorize(self.data, sort=sort)
 
 
 class Duplicated:

From 889b1b997242232c811ad0e3a39d954a03018b4b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 7 Apr 2020 22:33:21 +0200
Subject: [PATCH 08/10] add BaseMaskedDtype

---
 pandas/core/arrays/boolean.py | 16 ++--------------
 pandas/core/arrays/integer.py |  6 ++----
 pandas/core/arrays/masked.py  | 20 +++++++++++++++++++-
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index d11bf4bb41df2..254ee0b34411e 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -9,7 +9,6 @@
 from pandas.compat import set_function_name
 from pandas.compat.numpy import function as nv
 
-from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -30,14 +29,14 @@
 from pandas.core.array_algos import masked_reductions
 from pandas.core.indexers import check_array_indexer
 
-from .masked import BaseMaskedArray
+from .masked import BaseMaskedArray, BaseMaskedDtype
 
 if TYPE_CHECKING:
     import pyarrow  # noqa: F401
 
 
 @register_extension_dtype
-class BooleanDtype(ExtensionDtype):
+class BooleanDtype(BaseMaskedDtype):
     """
     Extension dtype for boolean data.
 
@@ -64,17 +63,6 @@ class BooleanDtype(ExtensionDtype):
 
     name = "boolean"
 
-    @property
-    def na_value(self) -> libmissing.NAType:
-        """
-        BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
-
-        .. warning::
-
-           `na_value` may change in a future release.
-        """
-        return libmissing.NA
-
     @property
     def type(self) -> Type[np.bool_]:
         return np.bool_
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 649f7f3c4751d..74ea28953325d 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -9,7 +9,6 @@
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -34,13 +33,13 @@
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.tools.numeric import to_numeric
 
-from .masked import BaseMaskedArray
+from .masked import BaseMaskedArray, BaseMaskedDtype
 
 if TYPE_CHECKING:
     import pyarrow  # noqa: F401
 
 
-class _IntegerDtype(ExtensionDtype):
+class _IntegerDtype(BaseMaskedDtype):
     """
     An ExtensionDtype to hold a single size & kind of integer dtype.
 
@@ -53,7 +52,6 @@ class _IntegerDtype(ExtensionDtype):
     name: str
     base = None
     type: Type
-    na_value = libmissing.NA
 
     def __repr__(self) -> str:
         sign = "U" if self.is_unsigned_integer else ""
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 52e31144a1468..dc8b9da039edd 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -4,8 +4,10 @@
 
 from pandas._libs import lib, missing as libmissing
 from pandas._typing import Scalar
+from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 
+from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
 from pandas.core.dtypes.missing import isna, notna
 
@@ -20,6 +22,18 @@
 BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
 
 
+class BaseMaskedDtype(ExtensionDtype):
+    """
+    Base class for dtypes for BaseMaskedArray subclasses.
+    """
+
+    na_value = libmissing.NA
+
+    @property
+    def numpy_dtype(self) -> np.dtype:
+        raise AbstractMethodError(self)  # abstract: subclasses must override
+
+
 class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
     """
     Base class for masked arrays (which use _data and _mask to store the data).
@@ -38,6 +52,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         self._data = values
         self._mask = mask
 
+    @property
+    def dtype(self) -> BaseMaskedDtype:
+        raise AbstractMethodError(self)
+
     def __getitem__(self, item):
         if is_integer(item):
             if self._mask[item]:
@@ -226,7 +244,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
         codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
 
         # the hashtables don't handle all different types of bits
-        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)  # type: ignore
+        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
         uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
         return codes, uniques
 

From 0370045436d35fb5d6f663fa1150a0135a1765ef Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 7 Apr 2020 22:53:32 +0200
Subject: [PATCH 09/10] use class variable annotation

---
 pandas/core/arrays/masked.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index dc8b9da039edd..56d355499438c 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -43,6 +43,7 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
 
     # The value used to fill '_data' to avoid upcasting
     _internal_fill_value: Scalar
+    dtype: BaseMaskedDtype
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         if copy:
@@ -52,10 +53,6 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         self._data = values
         self._mask = mask
 
-    @property
-    def dtype(self) -> BaseMaskedDtype:
-        raise AbstractMethodError(self)
-
     def __getitem__(self, item):
         if is_integer(item):
             if self._mask[item]:

From 6ed52397baa62d339aba2a3fa44f3e719d8a5aa0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 1 May 2020 13:55:17 +0200
Subject: [PATCH 10/10] use abstract dtype

---
 pandas/core/arrays/masked.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 59adbb16bc8f6..127de82e318a2 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -43,7 +43,6 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
 
     # The value used to fill '_data' to avoid upcasting
     _internal_fill_value: Scalar
-    dtype: BaseMaskedDtype
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         # values is supposed to already be validated in the subclass
@@ -64,6 +63,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         self._data = values
         self._mask = mask
 
+    @property
+    def dtype(self) -> BaseMaskedDtype:
+        raise AbstractMethodError(self)
+
     def __getitem__(self, item):
         if is_integer(item):
             if self._mask[item]: