diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2c9f632e8bc24..e64290a196523 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -47,7 +47,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com -from pandas.core.construction import array +from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices _shared_docs = {} # type: Dict[str, str] @@ -82,9 +82,12 @@ def _ensure_data(values, dtype=None): """ # we check some simple dtypes first + if is_object_dtype(dtype): + return ensure_object(np.asarray(values)), "object", "object" + elif is_object_dtype(values) and dtype is None: + return ensure_object(np.asarray(values)), "object", "object" + try: - if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), "object", "object" if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) @@ -95,8 +98,6 @@ def _ensure_data(values, dtype=None): return ensure_uint64(values), "uint64", "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): return ensure_float64(values), "float64", "float64" - elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), "object", "object" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -207,11 +208,11 @@ def _ensure_arraylike(values): _hashtables = { - "float64": (htable.Float64HashTable, htable.Float64Vector), - "uint64": (htable.UInt64HashTable, htable.UInt64Vector), - "int64": (htable.Int64HashTable, htable.Int64Vector), - "string": (htable.StringHashTable, htable.ObjectVector), - "object": (htable.PyObjectHashTable, htable.ObjectVector), + "float64": htable.Float64HashTable, + "uint64": htable.UInt64HashTable, + "int64": htable.Int64HashTable, + "string": htable.StringHashTable, + "object": htable.PyObjectHashTable, } @@ -223,11 +224,9 @@ def _get_hashtable_algo(values): Returns ------- - tuples(hashtable class, - vector class, - values, - dtype, - ndtype) + htable : HashTable subclass + values : ndarray + dtype : str or dtype """ values, dtype, ndtype = _ensure_data(values) @@ -238,23 +237,21 @@ def _get_hashtable_algo(values): # StringHashTable and ObjectHashtable if lib.infer_dtype(values, skipna=False) in ["string"]: ndtype = "string" - else: - ndtype = "object" - htable, table = _hashtables[ndtype] - return (htable, table, values, dtype, ndtype) + htable = _hashtables[ndtype] + return htable, values, dtype def _get_values_for_rank(values): if is_categorical_dtype(values): values = values._values_for_rank() - values, dtype, ndtype = _ensure_data(values) - return values, dtype, ndtype + values, _, ndtype = _ensure_data(values) + return values, ndtype -def _get_data_algo(values, func_map): - values, dtype, ndtype = _get_values_for_rank(values) +def _get_data_algo(values): + values, ndtype = _get_values_for_rank(values) if ndtype == "object": @@ -264,7 +261,7 @@ def _get_data_algo(values, func_map): if lib.infer_dtype(values, skipna=False) in ["string"]: ndtype = "string" - f = func_map.get(ndtype, func_map["object"]) + f = _hashtables.get(ndtype, _hashtables["object"]) return f, values @@ -295,7 +292,7 @@ def match(to_match, values, na_sentinel=-1): match : ndarray of integers """ values = com.asarray_tuplesafe(values) - htable, _, values, dtype, ndtype = _get_hashtable_algo(values) + htable, values, dtype = _get_hashtable_algo(values) to_match, _, _ = _ensure_data(to_match, dtype) table = htable(min(len(to_match), 1000000)) table.map_locations(values) @@ -398,7 +395,7 @@ def unique(values): return values.unique() original = values - htable, _, values, dtype, ndtype = _get_hashtable_algo(values) + htable, values, _ = _get_hashtable_algo(values) table = htable(len(values)) uniques = table.unique(values) @@ -480,7 +477,8 @@ def isin(comps, values): def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): - """Factorize an array-like to labels and uniques. + """ + Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -498,9 +496,10 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): Returns ------- - labels, uniques : ndarray + labels : ndarray + uniques : ndarray """ - (hash_klass, _), values = _get_data_algo(values, _hashtables) + hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) uniques, labels = table.factorize( @@ -652,17 +651,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): original = values if is_extension_array_dtype(values): - values = getattr(values, "_values", values) + values = extract_array(values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - if ( - is_datetime64_any_dtype(original) - or is_timedelta64_dtype(original) - or is_period_dtype(original) - ): + if original.dtype.kind in ["m", "M"]: na_value = na_value_for_dtype(original.dtype) else: na_value = None @@ -831,7 +826,7 @@ def duplicated(values, keep="first"): duplicated : ndarray """ - values, dtype, ndtype = _ensure_data(values) + values, _, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) return f(values, keep=keep) @@ -868,7 +863,7 @@ def mode(values, dropna: bool = True): mask = values.isnull() values = values[~mask] - values, dtype, ndtype = _ensure_data(values) + values, _, ndtype = _ensure_data(values) f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) result = f(values, dropna=dropna) @@ -906,7 +901,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ if values.ndim == 1: - values, _, _ = _get_values_for_rank(values) + values, _ = _get_values_for_rank(values) ranks = algos.rank_1d( values, ties_method=method, @@ -915,7 +910,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct pct=pct, ) elif values.ndim == 2: - values, _, _ = _get_values_for_rank(values) + values, _ = _get_values_for_rank(values) ranks = algos.rank_2d( values, axis=axis, @@ -1630,9 +1625,7 @@ def take_nd( if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - if isinstance(arr, (ABCIndexClass, ABCSeries)): - arr = arr._values - + arr = extract_array(arr) arr = np.asarray(arr) if indexer is None: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 70ed411f6a3e4..4d065bd234e0b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -47,14 +47,7 @@ from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms -from pandas.core.algorithms import ( - _get_data_algo, - _hashtables, - factorize, - take, - take_1d, - unique1d, -) +from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -2097,7 +2090,6 @@ def __setitem__(self, key, value): """ Item assignment. - Raises ------ ValueError @@ -2631,8 +2623,8 @@ def _get_codes_for_values(values, categories): values = ensure_object(values) categories = ensure_object(categories) - (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - (_, _), cats = _get_data_algo(categories, _hashtables) + hash_klass, vals = _get_data_algo(values) + _, cats = _get_data_algo(categories) t = hash_klass(len(cats)) t.map_locations(cats) return coerce_indexer_dtype(t.lookup(vals), cats) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7fcaf60088ad2..3e92906be706c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -21,7 +21,6 @@ ensure_str, is_bool, is_bool_dtype, - is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -1325,14 +1324,10 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): np.ndarray / pandas type of length, filled with value """ - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - - subarr = DatetimeIndex([value] * length, dtype=dtype) - elif is_categorical_dtype(dtype): - from pandas import Categorical + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + subarr = cls._from_sequence([value] * length, dtype=dtype) - subarr = Categorical([value] * length, dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 94810369785d3..706f6159bcafe 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -484,9 +484,7 @@ def sort_mixed(values): if sorter is None: # mixed types - (hash_klass, _), values = algorithms._get_data_algo( - values, algorithms._hashtables - ) + hash_klass, values = algorithms._get_data_algo(values) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered))