Skip to content

CLN: simplify core.algorithms #29199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 35 additions & 42 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from pandas.core.dtypes.missing import isna, na_value_for_dtype

from pandas.core import common as com
from pandas.core.construction import array
from pandas.core.construction import array, extract_array
from pandas.core.indexers import validate_indices

_shared_docs = {} # type: Dict[str, str]
Expand Down Expand Up @@ -82,9 +82,12 @@ def _ensure_data(values, dtype=None):
"""

# we check some simple dtypes first
if is_object_dtype(dtype):
return ensure_object(np.asarray(values)), "object", "object"
elif is_object_dtype(values) and dtype is None:
return ensure_object(np.asarray(values)), "object", "object"

try:
if is_object_dtype(dtype):
return ensure_object(np.asarray(values)), "object", "object"
if is_bool_dtype(values) or is_bool_dtype(dtype):
# we are actually coercing to uint64
# until our algos support uint8 directly (see TODO)
Expand All @@ -95,8 +98,6 @@ def _ensure_data(values, dtype=None):
return ensure_uint64(values), "uint64", "uint64"
elif is_float_dtype(values) or is_float_dtype(dtype):
return ensure_float64(values), "float64", "float64"
elif is_object_dtype(values) and dtype is None:
return ensure_object(np.asarray(values)), "object", "object"
elif is_complex_dtype(values) or is_complex_dtype(dtype):

# ignore the fact that we are casting to float
Expand Down Expand Up @@ -207,11 +208,11 @@ def _ensure_arraylike(values):


_hashtables = {
"float64": (htable.Float64HashTable, htable.Float64Vector),
"uint64": (htable.UInt64HashTable, htable.UInt64Vector),
"int64": (htable.Int64HashTable, htable.Int64Vector),
"string": (htable.StringHashTable, htable.ObjectVector),
"object": (htable.PyObjectHashTable, htable.ObjectVector),
"float64": htable.Float64HashTable,
"uint64": htable.UInt64HashTable,
"int64": htable.Int64HashTable,
"string": htable.StringHashTable,
"object": htable.PyObjectHashTable,
}


Expand All @@ -223,11 +224,9 @@ def _get_hashtable_algo(values):

Returns
-------
tuples(hashtable class,
vector class,
values,
dtype,
ndtype)
htable : HashTable subclass
values : ndarray
dtype : str or dtype
"""
values, dtype, ndtype = _ensure_data(values)

Expand All @@ -238,23 +237,21 @@ def _get_hashtable_algo(values):
# StringHashTable and ObjectHashtable
if lib.infer_dtype(values, skipna=False) in ["string"]:
ndtype = "string"
else:
ndtype = "object"

htable, table = _hashtables[ndtype]
return (htable, table, values, dtype, ndtype)
htable = _hashtables[ndtype]
return htable, values, dtype


def _get_values_for_rank(values):
if is_categorical_dtype(values):
values = values._values_for_rank()

values, dtype, ndtype = _ensure_data(values)
return values, dtype, ndtype
values, _, ndtype = _ensure_data(values)
return values, ndtype


def _get_data_algo(values, func_map):
values, dtype, ndtype = _get_values_for_rank(values)
def _get_data_algo(values):
values, ndtype = _get_values_for_rank(values)

if ndtype == "object":

Expand All @@ -264,7 +261,7 @@ def _get_data_algo(values, func_map):
if lib.infer_dtype(values, skipna=False) in ["string"]:
ndtype = "string"

f = func_map.get(ndtype, func_map["object"])
f = _hashtables.get(ndtype, _hashtables["object"])

return f, values

Expand Down Expand Up @@ -295,7 +292,7 @@ def match(to_match, values, na_sentinel=-1):
match : ndarray of integers
"""
values = com.asarray_tuplesafe(values)
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
htable, values, dtype = _get_hashtable_algo(values)
to_match, _, _ = _ensure_data(to_match, dtype)
table = htable(min(len(to_match), 1000000))
table.map_locations(values)
Expand Down Expand Up @@ -398,7 +395,7 @@ def unique(values):
return values.unique()

original = values
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
htable, values, _ = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
Expand Down Expand Up @@ -480,7 +477,8 @@ def isin(comps, values):


def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
"""Factorize an array-like to labels and uniques.
"""
Factorize an array-like to labels and uniques.

This doesn't do any coercion of types or unboxing before factorization.

Expand All @@ -498,9 +496,10 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):

Returns
-------
labels, uniques : ndarray
labels : ndarray
uniques : ndarray
"""
(hash_klass, _), values = _get_data_algo(values, _hashtables)
hash_klass, values = _get_data_algo(values)

table = hash_klass(size_hint or len(values))
uniques, labels = table.factorize(
Expand Down Expand Up @@ -652,17 +651,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
original = values

if is_extension_array_dtype(values):
values = getattr(values, "_values", values)
values = extract_array(values)
labels, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
values, dtype, _ = _ensure_data(values)

if (
is_datetime64_any_dtype(original)
or is_timedelta64_dtype(original)
or is_period_dtype(original)
):
if original.dtype.kind in ["m", "M"]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps a silly question, but wouldn't the datetimelike arrays pass through the is_extension_array_dtype branch here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Period and dt64tz cases go through that EA branch above; td64 and dt64-naive go through here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment to that effect? (We can also merge this and do that later.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will add that comment in the next pass.

na_value = na_value_for_dtype(original.dtype)
else:
na_value = None
Expand Down Expand Up @@ -831,7 +826,7 @@ def duplicated(values, keep="first"):
duplicated : ndarray
"""

values, dtype, ndtype = _ensure_data(values)
values, _, ndtype = _ensure_data(values)
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
return f(values, keep=keep)

Expand Down Expand Up @@ -868,7 +863,7 @@ def mode(values, dropna: bool = True):
mask = values.isnull()
values = values[~mask]

values, dtype, ndtype = _ensure_data(values)
values, _, ndtype = _ensure_data(values)

f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
result = f(values, dropna=dropna)
Expand Down Expand Up @@ -906,7 +901,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
"""
if values.ndim == 1:
values, _, _ = _get_values_for_rank(values)
values, _ = _get_values_for_rank(values)
ranks = algos.rank_1d(
values,
ties_method=method,
Expand All @@ -915,7 +910,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
pct=pct,
)
elif values.ndim == 2:
values, _, _ = _get_values_for_rank(values)
values, _ = _get_values_for_rank(values)
ranks = algos.rank_2d(
values,
axis=axis,
Expand Down Expand Up @@ -1630,9 +1625,7 @@ def take_nd(
if is_extension_array_dtype(arr):
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

if isinstance(arr, (ABCIndexClass, ABCSeries)):
arr = arr._values

arr = extract_array(arr)
arr = np.asarray(arr)

if indexer is None:
Expand Down
14 changes: 3 additions & 11 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,7 @@
from pandas.core import ops
from pandas.core.accessor import PandasDelegate, delegate_names
import pandas.core.algorithms as algorithms
from pandas.core.algorithms import (
_get_data_algo,
_hashtables,
factorize,
take,
take_1d,
unique1d,
)
from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
Expand Down Expand Up @@ -2097,7 +2090,6 @@ def __setitem__(self, key, value):
"""
Item assignment.
Raises
------
ValueError
Expand Down Expand Up @@ -2631,8 +2623,8 @@ def _get_codes_for_values(values, categories):
values = ensure_object(values)
categories = ensure_object(categories)

(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
(_, _), cats = _get_data_algo(categories, _hashtables)
hash_klass, vals = _get_data_algo(values)
_, cats = _get_data_algo(categories)
t = hash_klass(len(cats))
t.map_locations(cats)
return coerce_indexer_dtype(t.lookup(vals), cats)
Expand Down
11 changes: 3 additions & 8 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
ensure_str,
is_bool,
is_bool_dtype,
is_categorical_dtype,
is_complex,
is_complex_dtype,
is_datetime64_dtype,
Expand Down Expand Up @@ -1325,14 +1324,10 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
np.ndarray / pandas type of length, filled with value
"""
if is_datetime64tz_dtype(dtype):
from pandas import DatetimeIndex

subarr = DatetimeIndex([value] * length, dtype=dtype)
elif is_categorical_dtype(dtype):
from pandas import Categorical
if is_extension_array_dtype(dtype):
cls = dtype.construct_array_type()
subarr = cls._from_sequence([value] * length, dtype=dtype)

subarr = Categorical([value] * length, dtype=dtype)
else:
if not isinstance(dtype, (np.dtype, type(np.dtype))):
dtype = dtype.dtype
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,9 +484,7 @@ def sort_mixed(values):

if sorter is None:
# mixed types
(hash_klass, _), values = algorithms._get_data_algo(
values, algorithms._hashtables
)
hash_klass, values = algorithms._get_data_algo(values)
t = hash_klass(len(values))
t.map_locations(values)
sorter = ensure_platform_int(t.lookup(ordered))
Expand Down