|
14 | 14 |
|
15 | 15 | from pandas.core.dtypes.cast import (
|
16 | 16 | construct_1d_object_array_from_listlike,
|
| 17 | + infer_dtype_from_array, |
17 | 18 | maybe_promote,
|
18 | 19 | )
|
19 | 20 | from pandas.core.dtypes.common import (
|
@@ -639,8 +640,6 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
|
639 | 640 | )
|
640 | 641 |
|
641 | 642 | if sort and len(uniques) > 0:
|
642 |
| - from pandas.core.sorting import safe_sort |
643 |
| - |
644 | 643 | uniques, labels = safe_sort(
|
645 | 644 | uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
|
646 | 645 | )
|
@@ -1910,3 +1909,138 @@ def diff(arr, n: int, axis: int = 0):
|
1910 | 1909 | out_arr = out_arr.astype("int64").view("timedelta64[ns]")
|
1911 | 1910 |
|
1912 | 1911 | return out_arr
|
| 1912 | + |
| 1913 | + |
| 1914 | +# -------------------------------------------------------------------- |
| 1915 | +# Helper functions |
| 1916 | + |
| 1917 | +# Note: safe_sort is in algorithms.py instead of sorting.py because it is |
| 1918 | +# low-dependency, is used in this module, and uses private methods from |
| 1919 | +# this module. |
def safe_sort(
    values,
    labels=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``. Returned on its own when ``labels`` is None.
    new_labels : ndarray
        Reordered ``labels``; returned (as the second element of a tuple)
        only when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # Infer the dtype first so that mixed input is not silently converted
        # to a string dtype by np.asarray.
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # Sort the non-string items and the string items separately, then put
        # the non-strings first; a direct sort of mixed str/int is unorderable.
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None whenever the ordering came from ``sort_mixed``,
    # which does not produce an indexer; it is rebuilt below if labels are given.
    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # Note the trailing space: the two literals are concatenated as-is.
        raise TypeError(
            "Only list-like objects or None are allowed to be "
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the indexer by looking up each sorted value's
        # original position through a hash table built over ``values``.
        hash_klass, values = _get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        # ``order2`` is the inverse permutation of ``sorter`` (old position ->
        # new position).
        order2 = sorter.argsort()
        new_labels = take_1d(order2, labels, fill_value=-1)
        if verify:
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        # Build the inverse permutation explicitly: reverse_indexer[old] = new.
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
0 commit comments