BUG: algorithms.factorize moves null values when sort=False (#46601)

rhshadrach · asv-bot · web-flow · commit 56a71845decd · 2022-08-18T09:09:09.000-07:00
* BUG: algos.factorizes moves null values when sort=False

* Implementation for pyarrow &lt; 4.0

* fixup

* fixups

* test fixup

* type-hints

* improvements

* remove override of _from_factorized in string_.py

* Rework na_sentinel/dropna/ignore_na

* fixups

* fixup for pyarrow &lt; 4.0

* whatsnew note

* doc fixup

* fixups

* fixup whatsnew note

* whatsnew note; comment on old vs new API

Co-authored-by: asv-bot &lt;pandas.benchmarks@gmail.com&gt;
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -299,6 +299,7 @@ Other enhancements
 - Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`)
 - :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
+- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
@@ -922,6 +923,7 @@ Numeric
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
+- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
 
 Conversion
 ^^^^^^^^^^
@@ -1093,6 +1095,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`)
 - Bug in :meth:`DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list which misses the resample key (:issue:`47362`)
 - Bug in :meth:`DataFrame.groupby` would lose index columns when the DataFrame is empty for transforms, like fillna (:issue:`47787`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead the order that they are encountered (:issue:`46584`)
 -
 
 Reshaping
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -711,7 +711,7 @@ cdef class {{name}}HashTable(HashTable):
                             return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -743,7 +743,7 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques_vector = {{name}}Vector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True, mask=mask,
+                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                             return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1092,7 +1092,7 @@ cdef class StringHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1122,7 +1122,7 @@ cdef class StringHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1347,7 +1347,7 @@ cdef class PyObjectHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1377,7 +1377,7 @@ cdef class PyObjectHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -521,7 +521,7 @@ def f(c, v):
 
 def factorize_array(
     values: np.ndarray,
-    na_sentinel: int = -1,
+    na_sentinel: int | None = -1,
     size_hint: int | None = None,
     na_value: object = None,
     mask: npt.NDArray[np.bool_] | None = None,
@@ -552,6 +552,10 @@ def factorize_array(
     codes : ndarray[np.intp]
     uniques : ndarray
     """
+    ignore_na = na_sentinel is not None
+    if not ignore_na:
+        na_sentinel = -1
+
     original = values
     if values.dtype.kind in ["m", "M"]:
         # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -564,7 +568,11 @@ def factorize_array(
 
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
-        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
+        values,
+        na_sentinel=na_sentinel,
+        na_value=na_value,
+        mask=mask,
+        ignore_na=ignore_na,
     )
 
     # re-cast e.g. i8->dt64/td64, uint8->bool
@@ -738,6 +746,10 @@ def factorize(
     # responsible only for factorization. All data coercion, sorting and boxing
     # should happen here.
 
+    # GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
+    #   na_sentinel=None corresponds to use_na_sentinel=False
+    #   na_sentinel=-1 correspond to use_na_sentinel=True
+    # Other na_sentinel values will not be supported when the deprecation is enforced.
     na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
     if isinstance(values, ABCRangeIndex):
         return values.factorize(sort=sort)
@@ -749,10 +761,7 @@ def factorize(
 
     # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
     # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = True
-    if na_sentinel is None:
-        na_sentinel = -1
-        dropna = False
+    dropna = na_sentinel is not None
 
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
@@ -765,38 +774,58 @@ def factorize(
 
     elif not isinstance(values.dtype, np.dtype):
         if (
-            na_sentinel == -1
-            and "use_na_sentinel" in inspect.signature(values.factorize).parameters
-        ):
+            na_sentinel == -1 or na_sentinel is None
+        ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
             # Avoid using catch_warnings when possible
             # GH#46910 - TimelikeOps has deprecated signature
             codes, uniques = values.factorize(  # type: ignore[call-arg]
-                use_na_sentinel=True
+                use_na_sentinel=na_sentinel is not None
             )
         else:
+            na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
             with warnings.catch_warnings():
                 # We've already warned above
                 warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
-                codes, uniques = values.factorize(na_sentinel=na_sentinel)
+                codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)
 
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
+        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
+        #       na_sentinel is None we append NA on the end because safe_sort does not
+        #       handle null values in uniques.
+        if na_sentinel is None and sort:
+            na_sentinel_arg = -1
+        elif na_sentinel is None:
+            na_sentinel_arg = None
+        else:
+            na_sentinel_arg = na_sentinel
         codes, uniques = factorize_array(
-            values, na_sentinel=na_sentinel, size_hint=size_hint
+            values,
+            na_sentinel=na_sentinel_arg,
+            size_hint=size_hint,
         )
 
     if sort and len(uniques) > 0:
+        if na_sentinel is None:
+            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
+            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
-    code_is_na = codes == na_sentinel
-    if not dropna and code_is_na.any():
-        # na_value is set based on the dtype of uniques, and compat set to False is
-        # because we do not want na_value to be 0 for integers
-        na_value = na_value_for_dtype(uniques.dtype, compat=False)
-        uniques = np.append(uniques, [na_value])
-        codes = np.where(code_is_na, len(uniques) - 1, codes)
+    if not dropna and sort:
+        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
+        if na_sentinel is None:
+            na_sentinel_arg = -1
+        else:
+            na_sentinel_arg = na_sentinel
+        code_is_na = codes == na_sentinel_arg
+        if code_is_na.any():
+            # na_value is set based on the dtype of uniques, and compat set to False is
+            # because we do not want na_value to be 0 for integers
+            na_value = na_value_for_dtype(uniques.dtype, compat=False)
+            uniques = np.append(uniques, [na_value])
+            codes = np.where(code_is_na, len(uniques) - 1, codes)
 
     uniques = _reconstruct_data(uniques, original.dtype, original)
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -551,20 +551,32 @@ def factorize(
         use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[np.ndarray, ExtensionArray]:
         resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        if pa_version_under4p0:
+            encoded = self._data.dictionary_encode()
         else:
-            na_sentinel = resolved_na_sentinel
-        encoded = self._data.dictionary_encode()
+            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
+            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type
         ).to_pandas()
         if indices.dtype.kind == "f":
-            indices[np.isnan(indices)] = na_sentinel
+            indices[np.isnan(indices)] = (
+                resolved_na_sentinel if resolved_na_sentinel is not None else -1
+            )
         indices = indices.astype(np.int64, copy=False)
 
         if encoded.num_chunks:
             uniques = type(self)(encoded.chunk(0).dictionary)
+            if resolved_na_sentinel is None and pa_version_under4p0:
+                # TODO: share logic with BaseMaskedArray.factorize
+                # Insert na with the proper code
+                na_mask = indices.values == -1
+                na_index = na_mask.argmax()
+                if na_mask[na_index]:
+                    uniques = uniques.insert(na_index, self.dtype.na_value)
+                    na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
+                    indices[indices >= na_code] += 1
+                    indices[indices == -1] = na_code
         else:
             uniques = type(self)(pa.array([], type=encoded.type.value_type))
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1081,14 +1081,10 @@ def factorize(
         # 2. ExtensionArray.factorize.
         #    Complete control over factorization.
         resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
-        else:
-            na_sentinel = resolved_na_sentinel
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(
-            arr, na_sentinel=na_sentinel, na_value=na_value
+            arr, na_sentinel=resolved_na_sentinel, na_value=na_value
         )
 
         uniques_ea = self._from_factorized(uniques, self)
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -886,19 +886,39 @@ def factorize(
         use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[np.ndarray, ExtensionArray]:
         resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
-        else:
-            na_sentinel = resolved_na_sentinel
         arr = self._data
         mask = self._mask
 
-        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
+        # Pass non-None na_sentinel; recode and add NA to uniques if necessary below
+        na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel
+        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask)
 
         # check that factorize_array correctly preserves dtype.
         assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
 
-        uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
+        has_na = mask.any()
+        if resolved_na_sentinel is not None or not has_na:
+            size = len(uniques)
+        else:
+            # Make room for an NA value
+            size = len(uniques) + 1
+        uniques_mask = np.zeros(size, dtype=bool)
+        if resolved_na_sentinel is None and has_na:
+            na_index = mask.argmax()
+            # Insert na with the proper code
+            if na_index == 0:
+                na_code = np.intp(0)
+            else:
+                # mypy error: Slice index must be an integer or None
+                # https://github.com/python/mypy/issues/2410
+                na_code = codes[:na_index].argmax() + 1  # type: ignore[misc]
+            codes[codes >= na_code] += 1
+            codes[codes == -1] = na_code
+            # dummy value for uniques; not used since uniques_mask will be True
+            uniques = np.insert(uniques, na_code, 0)
+            uniques_mask[na_code] = True
+        uniques_ea = type(self)(uniques, uniques_mask)
+
         return codes, uniques_ea
 
     @doc(ExtensionArray._values_for_argsort)
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -879,6 +879,10 @@ def factorize(
         codes, uniques = algos.factorize(
             np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
         )
+        if na_sentinel is lib.no_default:
+            na_sentinel = -1
+        if use_na_sentinel is lib.no_default or use_na_sentinel:
+            codes[codes == -1] = na_sentinel
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -380,8 +380,8 @@ def __arrow_array__(self, type=None):
     def _values_for_factorize(self):
         arr = self._ndarray.copy()
         mask = self.isna()
-        arr[mask] = -1
-        return arr, -1
+        arr[mask] = None
+        return arr, None
 
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -658,9 +658,10 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
-        if self._passed_categorical:
+        if self._dropna and self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
-            # preserving the categories / ordered attributes
+            # preserving the categories / ordered attributes;
+            # doesn't (yet - GH#46909) handle dropna=False
             cat = self.grouping_vector
             categories = cat.categories
 
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py