From b0e057adeb1ca83ae89b7ec7e48dd50b523eaac8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Oct 2019 14:44:50 -0700 Subject: [PATCH 1/3] CLN: further algorithms cleanup --- pandas/core/algorithms.py | 76 +++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8f72245b1f4eb..37fa5ef7c3486 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -77,27 +77,28 @@ def _ensure_data(values, dtype=None): Returns ------- - (ndarray, pandas_dtype, algo dtype as a string) + values : ndarray + pandas_dtype : str or dtype """ # we check some simple dtypes first if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), "object", "object" + return ensure_object(np.asarray(values)), "object" elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), "object", "object" + return ensure_object(np.asarray(values)), "object" try: if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype("uint64"), "bool", "uint64" + return np.asarray(values).astype("uint64"), "bool" elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): - return ensure_int64(values), "int64", "int64" + return ensure_int64(values), "int64" elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): - return ensure_uint64(values), "uint64", "uint64" + return ensure_uint64(values), "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): - return ensure_float64(values), "float64", "float64" + return ensure_float64(values), "float64" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -105,12 +106,12 @@ def _ensure_data(values, dtype=None): with catch_warnings(): simplefilter("ignore", np.ComplexWarning) values = ensure_float64(values) - return values, "float64", "float64" + return values, "float64" except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here - return ensure_object(values), "object", "object" + return ensure_object(values), "object" # datetimelike if ( @@ -136,14 +137,14 @@ def _ensure_data(values, dtype=None): # Note: this is reached by DataFrame.rank calls GH#27027 asi8 = values.view("i8") dtype = values.dtype - return asi8, dtype, "int64" + return asi8, dtype from pandas import DatetimeIndex values = DatetimeIndex(values) dtype = values.dtype - return values.asi8, dtype, "int64" + return values.asi8, dtype elif is_categorical_dtype(values) and ( is_categorical_dtype(dtype) or dtype is None @@ -156,11 +157,11 @@ def _ensure_data(values, dtype=None): # until our algos support int* directly (not all do) values = ensure_int64(values) - return values, dtype, "int64" + return values, dtype # we have failed, return object values = np.asarray(values, dtype=np.object) - return ensure_object(values), "object", "object" + return ensure_object(values), "object" def _reconstruct_data(values, dtype, original): @@ -228,7 +229,8 @@ def _get_hashtable_algo(values): values : ndarray dtype : str or dtype """ - values, dtype, ndtype = _ensure_data(values) + values, dtype = _ensure_data(values) + ndtype = values.dtype.name if ndtype == "object": @@ -246,12 +248,13 @@ def _get_values_for_rank(values): if is_categorical_dtype(values): values = values._values_for_rank() - values, _, ndtype = _ensure_data(values) - return values, ndtype + values, _ = _ensure_data(values) + return values def _get_data_algo(values): - values, ndtype = _get_values_for_rank(values) + values = _get_values_for_rank(values) + ndtype = values.dtype.name if ndtype == "object": @@ -284,16 +287,13 @@ def match(to_match, values, na_sentinel=-1): na_sentinel : int, default -1 Value to mark "not found" - Examples - -------- - Returns ------- match : ndarray of integers """ values = com.asarray_tuplesafe(values) htable, values, dtype = _get_hashtable_algo(values) - to_match, _, _ = _ensure_data(to_match, dtype) + to_match, _ = _ensure_data(to_match, dtype) table = htable(min(len(to_match), 1000000)) table.map_locations(values) result = table.lookup(to_match) @@ -445,11 +445,11 @@ def isin(comps, values): comps = com.values_from_object(comps) - comps, dtype, _ = _ensure_data(comps) - values, _, _ = _ensure_data(values, dtype=dtype) + comps, dtype = _ensure_data(comps) + values, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d - f = lambda x, y: htable.ismember_object(x, values) + f = lambda x, y: htable.ismember_object(x, y) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception @@ -655,7 +655,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: - values, dtype, _ = _ensure_data(values) + values, dtype = _ensure_data(values) if original.dtype.kind in ["m", "M"]: na_value = na_value_for_dtype(original.dtype) @@ -716,15 +716,14 @@ def value_counts( ------- Series """ - from pandas.core.series import Series, Index + from pandas.core.series import Series name = getattr(values, "name", None) if bins is not None: + from pandas.core.reshape.tile import cut + values = Series(values) try: - from pandas.core.reshape.tile import cut - - values = Series(values) ii = cut(values, bins, include_lowest=True) except TypeError: raise TypeError("bins argument only works with numeric data.") @@ -754,8 +753,6 @@ def value_counts( else: keys, counts = _value_counts_arraylike(values, dropna) - if not isinstance(keys, Index): - keys = Index(keys) result = Series(counts, index=keys, name=name) if sort: @@ -781,9 +778,10 @@ def _value_counts_arraylike(values, dropna): """ values = _ensure_arraylike(values) original = values - values, dtype, ndtype = _ensure_data(values) + values, dtype = _ensure_data(values) + ndtype = values.dtype.name - if needs_i8_conversion(dtype): + if needs_i8_conversion(original.dtype): # i8 keys, counts = htable.value_count_int64(values, dropna) @@ -830,7 +828,8 @@ def duplicated(values, keep="first"): duplicated : ndarray """ - values, _, ndtype = _ensure_data(values) + values, _ = _ensure_data(values) + ndtype = values.dtype.name f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) return f(values, keep=keep) @@ -867,7 +866,8 @@ def mode(values, dropna: bool = True): mask = values.isnull() values = values[~mask] - values, _, ndtype = _ensure_data(values) + values, _ = _ensure_data(values) + ndtype = values.dtype.name f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) result = f(values, dropna=dropna) @@ -905,7 +905,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ if values.ndim == 1: - values, _ = _get_values_for_rank(values) + values = _get_values_for_rank(values) ranks = algos.rank_1d( values, ties_method=method, @@ -914,7 +914,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct pct=pct, ) elif values.ndim == 2: - values, _ = _get_values_for_rank(values) + values = _get_values_for_rank(values) ranks = algos.rank_2d( values, axis=axis, @@ -1157,7 +1157,7 @@ def compute(self, method): return dropped[slc].sort_values(ascending=ascending).head(n) # fast method - arr, pandas_dtype, _ = _ensure_data(dropped.values) + arr, pandas_dtype = _ensure_data(dropped.values) if method == "nlargest": arr = -arr if is_integer_dtype(pandas_dtype): From 91c42785aafa3c75bc9445274dec51d39ab3c407 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Oct 2019 20:00:25 -0700 Subject: [PATCH 2/3] CLN: algos simplification, de-duplication, typing --- pandas/core/algorithms.py | 102 +++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 10eb44c2d8b47..adc1f734dbeb6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -79,7 +79,6 @@ def _ensure_data(values, dtype=None): ------- values : ndarray pandas_dtype : str or dtype - """ # we check some simple dtypes first @@ -229,16 +228,8 @@ def _get_hashtable_algo(values): values : ndarray """ values, _ = _ensure_data(values) - ndtype = values.dtype.name - - if ndtype == "object": - - # it's cheaper to use a String Hash Table than Object; we infer - # including nulls because that is the only difference between - # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ["string"]: - ndtype = "string" + ndtype = _check_object_for_strings(values) htable = _hashtables[ndtype] return htable, values @@ -253,8 +244,27 @@ def _get_values_for_rank(values): def _get_data_algo(values): values = _get_values_for_rank(values) - ndtype = values.dtype.name + ndtype = _check_object_for_strings(values) + htable = _hashtables.get(ndtype, _hashtables["object"]) + + return htable, values + + +def _check_object_for_strings(values) -> str: + """ + Check if we can use string hashtable instead of object hashtable. + + Parameters + ---------- + values : ndarray + ndtype : str + + Returns + ------- + str + """ + ndtype = values.dtype.name if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer @@ -262,10 +272,7 @@ def _get_data_algo(values): # StringHashTable and ObjectHashtable if lib.infer_dtype(values, skipna=False) in ["string"]: ndtype = "string" - - htable = _hashtables.get(ndtype, _hashtables["object"]) - - return htable, values + return ndtype # --------------- # @@ -370,9 +377,9 @@ def unique(values): unique1d = unique -def isin(comps, values): +def isin(comps, values) -> np.ndarray: """ - Compute the isin boolean array + Compute the isin boolean array. Parameters ---------- @@ -381,7 +388,8 @@ def isin(comps, values): Returns ------- - boolean array same length as comps + ndarray[bool] + Same length as `comps`. """ if not is_list_like(comps): @@ -413,17 +421,17 @@ def isin(comps, values): values, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d - f = lambda x, y: htable.ismember_object(x, y) + f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1000000 and not is_object_dtype(comps): - f = lambda x, y: np.in1d(x, y) + f = np.in1d elif is_integer_dtype(comps): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) - f = lambda x, y: htable.ismember_int64(x, y) + f = htable.ismember_int64 except (TypeError, ValueError, OverflowError): values = values.astype(object) comps = comps.astype(object) @@ -432,7 +440,7 @@ def isin(comps, values): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False) - f = lambda x, y: htable.ismember_float64(x, y) + f = htable.ismember_float64 except (TypeError, ValueError): values = values.astype(object) comps = comps.astype(object) @@ -440,7 +448,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): +def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None): """ Factorize an array-like to labels and uniques. @@ -601,7 +609,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): ) @Appender(_shared_docs["factorize"]) @deprecate_kwarg(old_arg_name="order", new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): +def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) # 2.) factorizing labels and uniques @@ -657,7 +665,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -): +) -> ABCSeries: """ Compute a histogram of the counts of non-null values. @@ -686,6 +694,7 @@ def value_counts( if bins is not None: from pandas.core.reshape.tile import cut + values = Series(values) try: ii = cut(values, bins, include_lowest=True) @@ -728,25 +737,25 @@ def value_counts( return result -def _value_counts_arraylike(values, dropna): +def _value_counts_arraylike(values, dropna: bool): """ Parameters ---------- values : arraylike - dropna : boolean + dropna : bool Returns ------- - (uniques, counts) - + uniques : np.ndarray or ExtensionArray + counts : np.ndarray """ values = _ensure_arraylike(values) original = values - values, dtype = _ensure_data(values) + values, _ = _ensure_data(values) ndtype = values.dtype.name if needs_i8_conversion(original.dtype): - # i8 + # datetime, timedelta, or period keys, counts = htable.value_count_int64(values, dropna) @@ -772,7 +781,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep="first"): +def duplicated(values, keep="first") -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -798,7 +807,7 @@ def duplicated(values, keep="first"): return f(values, keep=keep) -def mode(values, dropna: bool = True): +def mode(values, dropna: bool = True) -> ABCSeries: """ Returns the mode(s) of an array. @@ -844,7 +853,14 @@ def mode(values, dropna: bool = True): return Series(result) -def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False): +def rank( + values, + axis: int = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, +): """ Rank the values along a given axis. @@ -1058,7 +1074,7 @@ def _get_score(at): class SelectN: - def __init__(self, obj, n, keep): + def __init__(self, obj, n: int, keep: str): self.obj = obj self.n = n self.keep = keep @@ -1168,7 +1184,7 @@ class SelectNFrame(SelectN): nordered : DataFrame """ - def __init__(self, obj, n, keep, columns): + def __init__(self, obj, n: int, keep: str, columns): super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] @@ -1307,7 +1323,7 @@ def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info): out[i, j] = arr[u_, v] -def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): +def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info): if mask_info is not None: mask, needs_masking = mask_info else: @@ -1424,7 +1440,7 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): } -def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): +def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis: int = 0, mask_info=None): if ndim <= 2: tup = (arr_dtype.name, out_dtype.name) if ndim == 1: @@ -1458,7 +1474,7 @@ def func(arr, indexer, out, fill_value=np.nan): return func -def take(arr, indices, axis=0, allow_fill=False, fill_value=None): +def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None): """ Take elements from an array. @@ -1552,7 +1568,13 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): def take_nd( - arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True + arr, + indexer, + axis=0, + out=None, + fill_value=np.nan, + mask_info=None, + allow_fill: bool = True, ): """ Specialized Cython take which sets NaN values in one pass From e7f6c6f7de122de189bdb0ef64278393a19ae67c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Oct 2019 22:34:03 -0700 Subject: [PATCH 3/3] STY: edits to make mypy happy --- pandas/core/algorithms.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index adc1f734dbeb6..98a090ef26f2a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1465,13 +1465,13 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis: int = 0, mask_info=N func = _convert_wrapper(func, out_dtype) return func - def func(arr, indexer, out, fill_value=np.nan): + def func2(arr, indexer, out, fill_value=np.nan): indexer = ensure_int64(indexer) _take_nd_object( arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info ) - return func + return func2 def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None): @@ -1661,9 +1661,9 @@ def take_nd( # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value if out is None: - out_shape = list(arr.shape) - out_shape[axis] = len(indexer) - out_shape = tuple(out_shape) + out_shape_ = list(arr.shape) + out_shape_[axis] = len(indexer) + out_shape = tuple(out_shape_) if arr.flags.f_contiguous and axis == arr.ndim - 1: # minor tweak that can make an order-of-magnitude difference # for dataframes initialized directly from 2-d ndarrays