@@ -77,7 +77,21 @@ def _constructor_from_sequence(cls, scalars):
77
77
raise AbstractMethodError (cls )
78
78
79
79
@classmethod
80
- def _constructor_from_simple_ndarray (cls , values , instance ):
80
+ def _from_factorized (cls , values , original ):
81
+ """Reconstruct an ExtensionArray after factorization.
82
+
83
+ Parameters
84
+ ----------
85
+ values : ndarray
86
+ An integer ndarray with the factorized values.
87
+ original : ExtensionArray
88
+ The original ExtensionArray that was factorized.
89
+
90
+ See Also
91
+ --------
92
+ pandas.factorize
93
+ ExtensionArray.factorize
94
+ """
81
95
raise AbstractMethodError (cls )
82
96
83
97
# ------------------------------------------------------------------------
@@ -305,7 +319,16 @@ def unique(self):
305
319
uniques = unique (self .astype (object ))
306
320
return self ._constructor_from_sequence (uniques )
307
321
308
- def _simple_ndarray (self ):
322
+ def _values_for_factorize (self ):
323
+ """Return an array suitable for factorization.
324
+
325
+ Returns
326
+ -------
327
+ ndarray
328
+ An array suitable for factorization. This should maintain order
329
+ and be a supported dtype.
330
+
331
+ """
309
332
return self .astype (object )
310
333
311
334
def factorize (self , na_sentinel = - 1 ):
@@ -337,17 +360,17 @@ def factorize(self, na_sentinel=-1):
337
360
-----
338
361
:meth:`pandas.factorize` offers a `sort` keyword as well.
339
362
"""
340
- # Implementor note: make sure to exclude missing values from your
341
- # `uniques`. It should only contain non-NA values .
363
+ # Implementor notes: There are two options for overriding the
364
+ # behavior of `factorize`: here and `_values_for_factorize` .
342
365
from pandas .core .algorithms import _factorize_array
343
366
344
367
mask = self .isna ()
345
- arr = self ._simple_ndarray ()
368
+ arr = self ._values_for_factorize ()
346
369
arr [mask ] = np .nan
347
370
348
371
labels , uniques = _factorize_array (arr , check_nulls = True ,
349
372
na_sentinel = na_sentinel )
350
- uniques = self ._constructor_from_simple_ndarray (uniques , instance = arr )
373
+ uniques = self ._from_factorized (uniques , arr )
351
374
return labels , uniques
352
375
353
376
# ------------------------------------------------------------------------
0 commit comments