@@ -547,21 +547,21 @@ def factorize(
547
547
resolved_na_sentinel = resolve_na_sentinel (na_sentinel , use_na_sentinel )
548
548
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
549
549
encoded = self ._data .dictionary_encode (null_encoding = null_encoding )
550
- indices = pa .chunked_array (
551
- [c .indices for c in encoded .chunks ], type = encoded .type .index_type
552
- ).to_pandas ()
553
- if indices .dtype .kind == "f" :
554
- indices [np .isnan (indices )] = (
555
- resolved_na_sentinel if resolved_na_sentinel is not None else - 1
550
+ if encoded .length () == 0 :
551
+ indices = np .array ([], dtype = np .intp )
552
+ uniques = type (self )(pa .chunked_array ([], type = encoded .type .value_type ))
553
+ else :
554
+ pa_indices = encoded .combine_chunks ().indices
555
+ if pa_indices .null_count > 0 :
556
+ fill_value = (
557
+ resolved_na_sentinel if resolved_na_sentinel is not None else - 1
558
+ )
559
+ pa_indices = pc .fill_null (pa_indices , fill_value )
560
+ indices = pa_indices .to_numpy (zero_copy_only = False , writable = True ).astype (
561
+ np .intp , copy = False
556
562
)
557
- indices = indices .astype (np .int64 , copy = False )
558
-
559
- if encoded .num_chunks :
560
563
uniques = type (self )(encoded .chunk (0 ).dictionary )
561
- else :
562
- uniques = type (self )(pa .array ([], type = encoded .type .value_type ))
563
-
564
- return indices .values , uniques
564
+ return indices , uniques
565
565
566
566
def reshape (self , * args , ** kwargs ):
567
567
raise NotImplementedError (
0 commit comments