Skip to content

TYP: annotate core.algorithms #33944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 6, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 27 additions & 18 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from pandas._libs import Timestamp, algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas._typing import AnyArrayLike, DtypeObj
from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj
from pandas.util._decorators import doc

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -44,6 +44,7 @@
is_timedelta64_dtype,
is_unsigned_integer_dtype,
needs_i8_conversion,
pandas_dtype,
)
from pandas.core.dtypes.generic import (
ABCExtensionArray,
Expand All @@ -66,7 +67,9 @@
# --------------- #
# dtype access #
# --------------- #
def _ensure_data(values, dtype=None):
def _ensure_data(
values, dtype: Optional[DtypeObj] = None
) -> Tuple[np.ndarray, DtypeObj]:
"""
routine to ensure that our data is of the correct
input dtype for lower-level routines
Expand All @@ -88,42 +91,43 @@ def _ensure_data(values, dtype=None):
Returns
-------
values : ndarray
pandas_dtype : str or dtype
pandas_dtype : np.dtype or ExtensionDtype
"""

if not isinstance(values, ABCMultiIndex):
# extract_array would raise
values = extract_array(values, extract_numpy=True)

# we check some simple dtypes first
if is_object_dtype(dtype):
return ensure_object(np.asarray(values)), "object"
return ensure_object(np.asarray(values)), np.dtype("object")
elif is_object_dtype(values) and dtype is None:
return ensure_object(np.asarray(values)), "object"
return ensure_object(np.asarray(values)), np.dtype("object")

try:
if is_bool_dtype(values) or is_bool_dtype(dtype):
# we are actually coercing to uint64
# until our algos support uint8 directly (see TODO)
return np.asarray(values).astype("uint64"), "bool"
return np.asarray(values).astype("uint64"), np.dtype("bool")
elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
return ensure_int64(values), "int64"
return ensure_int64(values), np.dtype("int64")
elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
return ensure_uint64(values), "uint64"
return ensure_uint64(values), np.dtype("uint64")
elif is_float_dtype(values) or is_float_dtype(dtype):
return ensure_float64(values), "float64"
return ensure_float64(values), np.dtype("float64")
elif is_complex_dtype(values) or is_complex_dtype(dtype):

# ignore the fact that we are casting to float
# which discards complex parts
with catch_warnings():
simplefilter("ignore", np.ComplexWarning)
values = ensure_float64(values)
return values, "float64"
return values, np.dtype("float64")

except (TypeError, ValueError, OverflowError):
# if we are trying to coerce to a dtype
# and it is incompat this will fall through to here
return ensure_object(values), "object"
return ensure_object(values), np.dtype("object")

# datetimelike
vals_dtype = getattr(values, "dtype", None)
Expand Down Expand Up @@ -159,7 +163,7 @@ def _ensure_data(values, dtype=None):
is_categorical_dtype(dtype) or dtype is None
):
values = values.codes
dtype = "category"
dtype = pandas_dtype("category")

# we are actually coercing to int64
# until our algos support int* directly (not all do)
Expand All @@ -169,22 +173,24 @@ def _ensure_data(values, dtype=None):

# we have failed, return object
values = np.asarray(values, dtype=np.object)
return ensure_object(values), "object"
return ensure_object(values), np.dtype("object")


def _reconstruct_data(values, dtype, original):
def _reconstruct_data(
values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ArrayLike is a typevar. Is the return type here always the same as the type of values, or dependent on dtype?

maybe just expand on the prose in the docstring till this bites us.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ArrayLike is a typevar. Is the return type here always the same as the type of values, or dependent on dtype?

Yikes, I think I've been using it incorrectly in a lot of places then. I've been using it as a synonym for Union[np.ndarray, ExtensionArray]. I guess I'll do a dedicated pass through the code to weed out those misuses.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been using it as a synonym for Union[np.ndarray, ExtensionArray]

This is OK if the alias only appears once in the function signature/return.

once it appears twice, then they are bound.

"""
reverse of _ensure_data

Parameters
----------
values : ndarray
dtype : pandas_dtype
original : ndarray-like
values : np.ndarray or ExtensionArray
dtype : np.ndtype or ExtensionDtype
original : AnyArrayLike
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick, values and dtype are 'expanded' aliases, do the same for AnyArrayLike?

again maybe more prose, is Index only allowed with bool_dtype? what about Series?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICT it can be Index or Series regardless of dtype — whatever was passed to the top-level function.


Returns
-------
Index for extension types, otherwise ndarray casted to dtype
ExtensionArray or np.ndarray
"""
if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
Expand Down Expand Up @@ -416,6 +422,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:

if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
values = construct_1d_object_array_from_listlike(list(values))
# TODO: could use ensure_arraylike here

comps = extract_array(comps, extract_numpy=True)
if is_categorical_dtype(comps):
Expand Down Expand Up @@ -729,6 +736,7 @@ def value_counts(
return result


# Called once from SparseArray
def _value_counts_arraylike(values, dropna: bool):
"""
Parameters
Expand Down Expand Up @@ -823,6 +831,7 @@ def mode(values, dropna: bool = True) -> "Series":
# categorical is a fast-path
if is_categorical_dtype(values):
if isinstance(values, Series):
# TODO: should we be passing `name` below?
return Series(values._values.mode(dropna=dropna), name=values.name)
return values.mode(dropna=dropna)

Expand Down