diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 80664a9ba3019..244f882f2c103 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,30 +8,22 @@
 from pandas import compat, _np_version_under1p8
 from pandas.types.cast import maybe_promote
-from pandas.types.generic import ABCSeries, ABCIndex
-from pandas.types.common import (is_unsigned_integer_dtype,
-                                 is_signed_integer_dtype,
-                                 is_integer_dtype,
-                                 is_complex_dtype,
-                                 is_categorical_dtype,
-                                 is_extension_type,
-                                 is_datetimetz,
-                                 is_period_dtype,
-                                 is_period_arraylike,
-                                 is_numeric_dtype,
-                                 is_float_dtype,
-                                 is_bool_dtype,
-                                 needs_i8_conversion,
-                                 is_categorical,
-                                 is_datetime64_dtype,
-                                 is_timedelta64_dtype,
-                                 is_scalar,
-                                 _ensure_platform_int,
-                                 _ensure_object,
-                                 _ensure_float64,
-                                 _ensure_uint64,
-                                 _ensure_int64,
-                                 is_list_like)
+from pandas.types.generic import (ABCSeries, ABCIndex,
+                                  ABCIndexClass, ABCCategorical)
+from pandas.types.common import (
+    is_unsigned_integer_dtype, is_signed_integer_dtype,
+    is_integer_dtype, is_complex_dtype,
+    is_categorical_dtype, is_sparse,
+    is_period_dtype,
+    is_numeric_dtype, is_float_dtype,
+    is_bool_dtype, needs_i8_conversion,
+    is_categorical, is_datetimetz,
+    is_datetime64_any_dtype, is_datetime64tz_dtype,
+    is_timedelta64_dtype,
+    is_scalar, is_list_like,
+    _ensure_platform_int, _ensure_object,
+    _ensure_float64, _ensure_uint64,
+    _ensure_int64)
 from pandas.compat.numpy import _np_version_under1p10
 from pandas.types.missing import isnull
@@ -45,40 +37,190 @@
 # dtype access #
 # --------------- #

-def _ensure_data_view(values):
+def _ensure_data(values, dtype=None):
     """
-    helper routine to ensure that our data is of the correct
+    routine to ensure that our data is of the correct
     input dtype for lower-level routines

+    This will coerce:
+    - ints -> int64
+    - uint -> uint64
+    - bool -> uint64 (TODO this should be uint8)
+    - datetimelike -> i8
+    - datetime64tz -> i8 (in local tz)
+    - categorical -> codes
+
     Parameters
     ----------
     values : array-like
+    dtype : pandas_dtype, optional
+        coerce to this dtype
+
+    Returns
+    -------
+    (ndarray, pandas_dtype, algo dtype as a string)
+
     """

-    if needs_i8_conversion(values):
-        values = values.view(np.int64)
-    elif is_period_arraylike(values):
-        from pandas.tseries.period import PeriodIndex
-        values = PeriodIndex(values).asi8
-    elif is_categorical_dtype(values):
-        values = values.values.codes
-    elif isinstance(values, (ABCSeries, ABCIndex)):
-        values = values.values
-
-    if is_signed_integer_dtype(values):
+    if (needs_i8_conversion(values) or
+            is_period_dtype(dtype) or
+            is_datetime64_any_dtype(dtype) or
+            is_timedelta64_dtype(dtype)):
+        if is_period_dtype(values) or is_period_dtype(dtype):
+            from pandas import PeriodIndex
+            values = PeriodIndex(values)
+            dtype = values.dtype
+        elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
+            from pandas import TimedeltaIndex
+            values = TimedeltaIndex(values)
+            dtype = values.dtype
+        else:
+            # Datetime
+            from pandas import DatetimeIndex
+            values = DatetimeIndex(values)
+            dtype = values.dtype
+
+        return values.asi8, dtype, 'int64'
+
+    elif is_categorical_dtype(values) or is_categorical_dtype(dtype):
+        values = getattr(values, 'values', values)
+        values = values.codes
+        dtype = 'category'
+
+        # we are actually coercing to int64
+        # until our algos support int* directly (not all do)
         values = _ensure_int64(values)
-    elif is_unsigned_integer_dtype(values):
-        values = _ensure_uint64(values)
-    elif is_complex_dtype(values):
-        values = _ensure_float64(values)
-    elif is_float_dtype(values):
-        values = _ensure_float64(values)
-    else:
+
+        return values, dtype, 'int64'
+
+    values = np.asarray(values)
+
+    try:
+        if is_bool_dtype(values) or is_bool_dtype(dtype):
+            # we are actually coercing to uint64
+            # until our algos support uint8 directly (see TODO)
+            values = values.astype('uint64')
+            dtype = 'bool'
+            ndtype = 'uint64'
+        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
+            values = _ensure_int64(values)
+            ndtype = dtype = 'int64'
+        elif (is_unsigned_integer_dtype(values) or
+              is_unsigned_integer_dtype(dtype)):
+            values = _ensure_uint64(values)
+            ndtype = dtype = 'uint64'
+        elif is_complex_dtype(values) or is_complex_dtype(dtype):
+            values = _ensure_float64(values)
+            ndtype = dtype = 'float64'
+        elif is_float_dtype(values) or is_float_dtype(dtype):
+            values = _ensure_float64(values)
+            ndtype = dtype = 'float64'
+        else:
+            values = _ensure_object(values)
+            ndtype = dtype = 'object'
+
+    except (TypeError, ValueError):
+        # if we are trying to coerce to a dtype
+        # and it is incompatible, this will fall through to here
         values = _ensure_object(values)
+        ndtype = dtype = 'object'
+
+    return values, dtype, ndtype
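The `_ensure_data`/`_reconstruct_data` pair is the heart of this refactor: every algo coerces once on the way in and restores the dtype on the way out. A minimal sketch of the intended round trip, assuming the private helpers keep the signatures shown in this patch:

```python
# Sketch only: exercises the private helpers introduced above.
import pandas as pd
from pandas.core.algorithms import _ensure_data, _reconstruct_data

dti = pd.date_range('2017-01-01', periods=3, tz='US/Eastern')
values, dtype, ndtype = _ensure_data(dti)   # i8 view, tz-aware dtype, 'int64'
assert ndtype == 'int64'

# round trip: the i8 values come back as a tz-aware DatetimeIndex,
# via the extension-type branch of _reconstruct_data below
restored = _reconstruct_data(values, dtype, dti)
```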
+
+
+def _reconstruct_data(values, dtype, original):
+    """
+    reverse of _ensure_data
+
+    Parameters
+    ----------
+    values : ndarray
+    dtype : pandas_dtype
+    original : ndarray-like
+
+    Returns
+    -------
+    Index for extension types, otherwise ndarray casted to dtype
+
+    """
+    from pandas import Index
+    if is_categorical_dtype(dtype):
+        pass
+    elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
+        values = Index(original)._shallow_copy(values, name=None)
+    elif dtype is not None:
+        values = values.astype(dtype)

     return values


+def _ensure_arraylike(values):
+    """
+    ensure that we are arraylike if not already
+    """
+    if not isinstance(values, (np.ndarray, ABCCategorical,
+                               ABCIndexClass, ABCSeries)):
+        values = np.array(values)
+    return values
+
+
+_hashtables = {
+    'float64': (htable.Float64HashTable, htable.Float64Vector),
+    'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
+    'int64': (htable.Int64HashTable, htable.Int64Vector),
+    'string': (htable.StringHashTable, htable.ObjectVector),
+    'object': (htable.PyObjectHashTable, htable.ObjectVector)
+}
+
+
+def _get_hashtable_algo(values):
+    """
+    Parameters
+    ----------
+    values : arraylike
+
+    Returns
+    -------
+    tuple of (hashtable class, vector class,
+              values, dtype, ndtype)
+    """
+    values, dtype, ndtype = _ensure_data(values)
+
+    if ndtype == 'object':
+
+        # it's cheaper to use a StringHashTable than an ObjectHashTable
+        if lib.infer_dtype(values) in ['string']:
+            ndtype = 'string'
+        else:
+            ndtype = 'object'
+
+    htable, table = _hashtables[ndtype]
+    return (htable, table, values, dtype, ndtype)
+
+
+def _get_data_algo(values, func_map):
+
+    if is_categorical_dtype(values):
+        values = values._values_for_rank()
+
+    values, dtype, ndtype = _ensure_data(values)
+    if ndtype == 'object':
+
+        # it's cheaper to use a StringHashTable than an ObjectHashTable
+        if lib.infer_dtype(values) in ['string']:
+            ndtype = 'string'
+
+    # not all func_maps have a 'string' entry; fall back to 'object'
+    f = func_map.get(ndtype, func_map['object'])
+
+    return f, values
+
+
 # --------------- #
 # top-level algos #
 # --------------- #
@@ -104,92 +246,41 @@ def match(to_match, values, na_sentinel=-1):
     match : ndarray of integers
     """
     values = com._asarray_tuplesafe(values)
-    if issubclass(values.dtype.type, string_types):
-        values = np.array(values, dtype='O')
-
-    f = lambda htype, caster: _match_object(to_match, values, htype, caster)
-    result = _hashtable_algo(f, values, np.int64)
+    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
+    to_match, _, _ = _ensure_data(to_match, dtype)
+    table = htable(min(len(to_match), 1000000))
+    table.map_locations(values)
+    result = table.lookup(to_match)

     if na_sentinel != -1:

         # replace but return a numpy array
         # use a Series because it handles dtype conversions properly
-        from pandas.core.series import Series
+        from pandas import Series
         result = Series(result.ravel()).replace(-1, na_sentinel).values.\
             reshape(result.shape)

     return result


-def _match_object(values, index, table_type, type_caster):
-    values = type_caster(values)
-    index = type_caster(index)
-    table = table_type(min(len(index), 1000000))
-    table.map_locations(index)
-    return table.lookup(values)
-
-
-def unique(values):
-    """
-    Compute unique values (not necessarily sorted) efficiently from input array
-    of values
-
-    Parameters
-    ----------
-    values : array-like
-
-    Returns
-    -------
-    uniques
-    """
-    values = com._asarray_tuplesafe(values)
-
-    f = lambda htype, caster: _unique_object(values, htype, caster)
-    return _hashtable_algo(f, values)
-
-
-def _unique_object(values, table_type, type_caster):
-    values = type_caster(values)
-    table = table_type(min(len(values), 1000000))
-    uniques = table.unique(values)
-    return type_caster(uniques)
-
-
 def unique1d(values):
     """
     Hash table-based unique
     """
-    if np.issubdtype(values.dtype, np.floating):
-        table = htable.Float64HashTable(len(values))
-        uniques = np.array(table.unique(_ensure_float64(values)),
-                           dtype=np.float64)
-    elif np.issubdtype(values.dtype, np.datetime64):
-        table = htable.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('M8[ns]')
-    elif np.issubdtype(values.dtype, np.timedelta64):
-        table = htable.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-        uniques = uniques.view('m8[ns]')
-    elif np.issubdtype(values.dtype, np.signedinteger):
-        table = htable.Int64HashTable(len(values))
-        uniques = table.unique(_ensure_int64(values))
-    elif np.issubdtype(values.dtype, np.unsignedinteger):
-        table = htable.UInt64HashTable(len(values))
-        uniques = table.unique(_ensure_uint64(values))
-    else:
-
-        # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
-            table = htable.StringHashTable(len(values))
-        else:
-            table = htable.PyObjectHashTable(len(values))
+    values = _ensure_arraylike(values)
+    original = values
+    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

-        uniques = table.unique(_ensure_object(values))
+    table = htable(len(values))
+    uniques = table.unique(values)
+    uniques = _reconstruct_data(uniques, dtype, original)

     return uniques


+unique = unique1d
+
+
 def isin(comps, values):
     """
     Compute the isin boolean array
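With the hash-table plumbing centralized, `match` reduces to one `map_locations`/`lookup` pass. A small sketch of the semantics (grounded in the rewritten body above: results are positions in `values`, misses become `na_sentinel`):

```python
# Sketch of match() semantics after the rewrite above.
import numpy as np
from pandas.core import algorithms

values = np.array(['b', 'a', 'c'], dtype=object)
to_match = np.array(['a', 'x', 'c'], dtype=object)

algorithms.match(to_match, values)                   # -> array([ 1, -1,  2])
algorithms.match(to_match, values, na_sentinel=-99)  # misses become -99
```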
@@ -213,38 +304,11 @@ def isin(comps, values):
                         " to isin(), you passed a "
                         "[{0}]".format(type(values).__name__))

-    from pandas import DatetimeIndex, TimedeltaIndex, PeriodIndex
-
     if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
         values = np.array(list(values), dtype='object')

-    if needs_i8_conversion(comps):
-        if is_period_dtype(values):
-            comps = PeriodIndex(comps)
-            values = PeriodIndex(values)
-        elif is_timedelta64_dtype(comps):
-            comps = TimedeltaIndex(comps)
-            values = TimedeltaIndex(values)
-        else:
-            comps = DatetimeIndex(comps)
-            values = DatetimeIndex(values)
-
-        values = values.asi8
-        comps = comps.asi8
-    elif is_bool_dtype(comps):
-
-        try:
-            comps = np.asarray(comps).view('uint8')
-            values = np.asarray(values).view('uint8')
-        except TypeError:
-            # object array conversion will fail
-            pass
-    elif is_numeric_dtype(comps):
-        comps = np.asarray(comps)
-        values = np.asarray(values)
-    else:
-        comps = np.asarray(comps).astype(object)
-        values = np.asarray(values).astype(object)
+    comps, dtype, _ = _ensure_data(comps)
+    values, _, _ = _ensure_data(values, dtype=dtype)

     # GH11232
     # work-around for numpy < 1.8 and comparisions on py3
@@ -396,53 +460,32 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     note: an array of Periods will ignore sort as it returns an always sorted
     PeriodIndex
     """
-    from pandas import Index, Series, DatetimeIndex, PeriodIndex
-
-    # handling possibilities here
-    # - for a numpy datetimelike simply view as i8 then cast back
-    # - bool handled as uint8 then cast back
-    # - for an extension datetimelike view as i8 then
-    #   reconstruct from boxed values to transfer metadata
-    dtype = None
-
-    if needs_i8_conversion(values):
-        if is_period_dtype(values):
-            values = PeriodIndex(values)
-            vals = values.asi8
-        elif is_datetimetz(values):
-            values = DatetimeIndex(values)
-            vals = values.asi8
-        else:
-            # numpy dtype
-            dtype = values.dtype
-            vals = values.view(np.int64)
-    elif is_bool_dtype(values):
-        dtype = bool
-        vals = np.asarray(values).view('uint8')
-    else:
-        vals = np.asarray(values)

-    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
+    original = values
+    values, dtype, _ = _ensure_data(values)
+    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)

-    table = hash_klass(size_hint or len(vals))
+    table = hash_klass(size_hint or len(values))
     uniques = vec_klass()
-    check_nulls = not is_integer_dtype(values)
-    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
+    check_nulls = not is_integer_dtype(original)
+    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)

     labels = _ensure_platform_int(labels)
-
     uniques = uniques.to_array()

     if sort and len(uniques) > 0:
         uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                     assume_unique=True)

-    if dtype is not None:
-        uniques = uniques.astype(dtype)
+    uniques = _reconstruct_data(uniques, dtype, original)

-    if isinstance(values, Index):
-        uniques = values._shallow_copy(uniques, name=None)
-    elif isinstance(values, Series):
+    # wrap the uniques to match the original container:
+    # Index in -> Index out; Series in -> Index of uniques
+    if isinstance(original, ABCIndexClass):
+        uniques = original._shallow_copy(uniques, name=None)
+    elif isinstance(original, ABCSeries):
+        from pandas import Index
         uniques = Index(uniques)
+
     return labels, uniques
@@ -471,7 +514,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     value_counts : Series

     """
-    from pandas.core.series import Series
+    from pandas.core.series import Series, Index
     name = getattr(values, 'name', None)

     if bins is not None:
@@ -483,17 +526,16 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
             raise TypeError("bins argument only works with numeric data.")
         values = cat.codes

-    if is_extension_type(values) and not is_datetimetz(values):
+    if is_categorical_dtype(values) or is_sparse(values):
+
         # handle Categorical and sparse
-        # datetime tz can be handeled in ndarray path
         result = Series(values).values.value_counts(dropna=dropna)
         result.name = name
         counts = result.values
+
     else:
-        # ndarray path. pass original to handle DatetimeTzBlock
-        keys, counts = _value_counts_arraylike(values, dropna=dropna)
+        keys, counts = _value_counts_arraylike(values, dropna)

-    from pandas import Index, Series
     if not isinstance(keys, Index):
         keys = Index(keys)
     result = Series(counts, index=keys, name=name)
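`factorize` now funnels through the same pair of helpers: uniques are rebuilt with `_reconstruct_data` and then re-wrapped to match the input container. A hedged sketch of the observable behavior this preserves:

```python
# Sketch of the behavior the rewritten factorize() preserves.
import numpy as np
import pandas as pd

labels, uniques = pd.factorize(np.array(['b', 'a', 'b'], dtype=object))
# labels -> array([0, 1, 0]); uniques -> array(['b', 'a'], dtype=object)

# tz-aware values round-trip through _reconstruct_data, so uniques
# come back boxed (an Index), not as raw i8 integers
s = pd.Series(pd.date_range('2017-01-01', periods=2, tz='US/Eastern'))
labels, uniques = pd.factorize(s)   # uniques is an Index, per the branch above
```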
@@ -513,60 +555,45 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     return result


-def _value_counts_arraylike(values, dropna=True):
-    is_datetimetz_type = is_datetimetz(values)
-    is_period_type = (is_period_dtype(values) or
-                      is_period_arraylike(values))
-
-    orig = values
-
-    from pandas.core.series import Series
-    values = Series(values).values
-    dtype = values.dtype
+def _value_counts_arraylike(values, dropna):
+    """
+    Parameters
+    ----------
+    values : arraylike
+    dropna : boolean

-    if needs_i8_conversion(dtype) or is_period_type:
+    Returns
+    -------
+    (uniques, counts)

-        from pandas.tseries.index import DatetimeIndex
-        from pandas.tseries.period import PeriodIndex
+    """
+    values = _ensure_arraylike(values)
+    original = values
+    values, dtype, ndtype = _ensure_data(values)

-        if is_period_type:
-            # values may be an object
-            values = PeriodIndex(values)
-            freq = values.freq
+    if needs_i8_conversion(dtype):
+        # i8

-        values = values.view(np.int64)
         keys, counts = htable.value_count_int64(values, dropna)

         if dropna:
             msk = keys != iNaT
             keys, counts = keys[msk], counts[msk]

-        # convert the keys back to the dtype we came in
-        keys = keys.astype(dtype)
-
-        # dtype handling
-        if is_datetimetz_type:
-            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
-        elif is_period_type:
-            keys = PeriodIndex._from_ordinals(keys, freq=freq)
-
-    elif is_signed_integer_dtype(dtype):
-        values = _ensure_int64(values)
-        keys, counts = htable.value_count_int64(values, dropna)
-    elif is_unsigned_integer_dtype(dtype):
-        values = _ensure_uint64(values)
-        keys, counts = htable.value_count_uint64(values, dropna)
-    elif is_float_dtype(dtype):
-        values = _ensure_float64(values)
-        keys, counts = htable.value_count_float64(values, dropna)
     else:
-        values = _ensure_object(values)
-        keys, counts = htable.value_count_object(values, dropna)
+        # ndarray like

+        # TODO: handle uint8
+        f = getattr(htable, "value_count_{dtype}".format(dtype=ndtype))
+        keys, counts = f(values, dropna)

     mask = isnull(values)
     if not dropna and mask.any():
-        keys = np.insert(keys, 0, np.NaN)
-        counts = np.insert(counts, 0, mask.sum())
+        if not isnull(keys).any():
+            keys = np.insert(keys, 0, np.NaN)
+            counts = np.insert(counts, 0, mask.sum())
+
+    keys = _reconstruct_data(keys, original.dtype, original)

     return keys, counts
@@ -593,33 +620,9 @@ def duplicated(values, keep='first'):
     duplicated : ndarray
     """
-    dtype = values.dtype
-
-    # no need to revert to original type
-    if needs_i8_conversion(dtype):
-        values = values.view(np.int64)
-    elif is_period_arraylike(values):
-        from pandas.tseries.period import PeriodIndex
-        values = PeriodIndex(values).asi8
-    elif is_categorical_dtype(dtype):
-        values = values.values.codes
-    elif isinstance(values, (ABCSeries, ABCIndex)):
-        values = values.values
-
-    if is_signed_integer_dtype(dtype):
-        values = _ensure_int64(values)
-        duplicated = htable.duplicated_int64(values, keep=keep)
-    elif is_unsigned_integer_dtype(dtype):
-        values = _ensure_uint64(values)
-        duplicated = htable.duplicated_uint64(values, keep=keep)
-    elif is_float_dtype(dtype):
-        values = _ensure_float64(values)
-        duplicated = htable.duplicated_float64(values, keep=keep)
-    else:
-        values = _ensure_object(values)
-        duplicated = htable.duplicated_object(values, keep=keep)
-
-    return duplicated
+    values, dtype, ndtype = _ensure_data(values)
+    f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
+    return f(values, keep=keep)
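The `if not isnull(keys).any()` guard above avoids inserting a second NaN bucket when the hashtable path has already produced one under `dropna=False`. The rewritten test in test_algos.py (further down in this diff) pins exactly this path:

```python
# dropna=False keeps the NaN bucket; mirrors the rewritten test_dropna
# (64-bit ordering; the test notes 32-bit linux orders differently).
import pandas as pd

result = pd.Series([10.3, 5., 5., None]).value_counts(dropna=False)
# 5.0     2
# 10.3    1
# NaN     1
```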
@@ -635,40 +638,34 @@ def mode(values):
     """
     Returns the mode(s) of an array.

     Parameters
     ----------
     values : array-like
         Array over which to check for duplicate values.

     Returns
     -------
     mode : Series
     """
+    from pandas import Series

-    # must sort because hash order isn't necessarily defined.
-    from pandas.core.series import Series
+    values = _ensure_arraylike(values)
+    original = values

-    if isinstance(values, Series):
-        constructor = values._constructor
-        values = values.values
-    else:
-        values = np.asanyarray(values)
-        constructor = Series
+    # categorical is a fast-path
+    if is_categorical_dtype(values):

-    dtype = values.dtype
-    if is_signed_integer_dtype(values):
-        values = _ensure_int64(values)
-        result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
-    elif is_unsigned_integer_dtype(values):
-        values = _ensure_uint64(values)
-        result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
-    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
-        dtype = values.dtype
-        values = values.view(np.int64)
-        result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
-    elif is_categorical_dtype(values):
-        result = constructor(values.mode())
-    else:
+        if isinstance(values, Series):
+            return Series(values.values.mode())
+        return values.mode()
+
+    values, dtype, ndtype = _ensure_data(values)
+
+    # TODO: this should support float64
+    if ndtype not in ['int64', 'uint64', 'object']:
+        ndtype = 'object'
         values = _ensure_object(values)
-        res = htable.mode_object(values)
-        try:
-            res = np.sort(res)
-        except TypeError as e:
-            warn("Unable to sort modes: %s" % e)
-        result = constructor(res, dtype=dtype)
-    return result
+
+    f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
+    result = f(values)
+    try:
+        result = np.sort(result)
+    except TypeError as e:
+        warn("Unable to sort modes: %s" % e)
+
+    result = _reconstruct_data(result, original.dtype, original)
+    return Series(result)


 def rank(values, axis=0, method='average', na_option='keep',
@@ -859,6 +856,12 @@ def quantile(x, q, interpolation_method='fraction'):

     values = np.sort(x)

+    def _interpolate(a, b, fraction):
+        """Returns the point at the given fraction between a and b, where
+        'fraction' must be between 0 and 1.
+        """
+        return a + (b - a) * fraction
+
     def _get_score(at):
         if len(values) == 0:
             return np.nan
@@ -887,261 +890,186 @@ def _get_score(at):

     return algos.arrmap_float64(q, _get_score)


-def _interpolate(a, b, fraction):
-    """Returns the point at the given fraction between a and b, where
-    'fraction' must be between 0 and 1.
-    """
-    return a + (b - a) * fraction
-
-
-def nsmallest(arr, n, keep='first'):
-    """
-    Find the indices of the n smallest values of a numpy array.
-
-    Note: Fails silently with NaN.
-    """
-    if keep == 'last':
-        arr = arr[::-1]
-
-    narr = len(arr)
-    n = min(n, narr)
-
-    arr = _ensure_data_view(arr)
-    kth_val = algos.kth_smallest(arr.copy(), n - 1)
-    return _finalize_nsmallest(arr, kth_val, n, keep, narr)
-
-
-def nlargest(arr, n, keep='first'):
-    """
-    Find the indices of the n largest values of a numpy array.
-
-    Note: Fails silently with NaN.
-    """
-    arr = _ensure_data_view(arr)
-    return nsmallest(-arr, n, keep=keep)
+# --------------- #
+# select n        #
+# --------------- #
- """ - arr = _ensure_data_view(arr) - return nsmallest(-arr, n, keep=keep) + def __init__(self, obj, n, keep): + self.obj = obj + self.n = n + self.keep = keep + if self.keep not in ('first', 'last'): + raise ValueError('keep must be either "first", "last"') -def select_n_slow(dropped, n, keep, method): - reverse_it = (keep == 'last' or method == 'nlargest') - ascending = method == 'nsmallest' - slc = np.s_[::-1] if reverse_it else np.s_[:] - return dropped[slc].sort_values(ascending=ascending).head(n) + def nlargest(self): + return self.compute('nlargest') + def nsmallest(self): + return self.compute('nsmallest') -_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} + @staticmethod + def is_valid_dtype_n_method(dtype): + """ + Helper function to determine if dtype is valid for + nsmallest/nlargest methods + """ + return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or + needs_i8_conversion(dtype)) -def _is_valid_dtype_n_method(dtype): - """ - Helper function to determine if dtype is valid for - nsmallest/nlargest methods +class SelectNSeries(SelectN): """ - return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)) - - -def select_n_series(series, n, keep, method): - """Implement n largest/smallest for pandas Series + Implement n largest/smallest for Series Parameters ---------- - series : pandas.Series object + obj : Series n : int keep : {'first', 'last'}, default 'first' - method : str, {'nlargest', 'nsmallest'} Returns ------- nordered : Series """ - dtype = series.dtype - if not _is_valid_dtype_n_method(dtype): - raise TypeError("Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, dtype=dtype)) - if keep not in ('first', 'last'): - raise ValueError('keep must be either "first", "last"') + def compute(self, method): + + n = self.n + dtype = self.obj.dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError("Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, + dtype=dtype)) + + if n <= 0: + return self.obj[[]] + + dropped = self.obj.dropna() + + # slow method + if n >= len(self.obj): - if n <= 0: - return series[[]] + reverse_it = (self.keep == 'last' or method == 'nlargest') + ascending = method == 'nsmallest' + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].sort_values(ascending=ascending).head(n) - dropped = series.dropna() + # fast method + arr, _, _ = _ensure_data(dropped.values) + if method == 'nlargest': + arr = -arr - if n >= len(series): - return select_n_slow(dropped, n, keep, method) + if self.keep == 'last': + arr = arr[::-1] - inds = _select_methods[method](dropped.values, n, keep) - return dropped.iloc[inds] + narr = len(arr) + n = min(n, narr) + kth_val = algos.kth_smallest(arr.copy(), n - 1) + ns, = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + if self.keep == 'last': + # reverse indices + inds = narr - 1 - inds -def select_n_frame(frame, columns, n, method, keep): - """Implement n largest/smallest for pandas DataFrame + return dropped.iloc[inds] + + +class SelectNFrame(SelectN): + """ + Implement n largest/smallest for DataFrame Parameters ---------- - frame : pandas.DataFrame object - columns : list or str + obj : DataFrame n : int keep : {'first', 'last'}, default 'first' - method : str, {'nlargest', 'nsmallest'} + columns : list or str Returns ------- nordered : DataFrame """ - from pandas import Int64Index - if not is_list_like(columns): - columns = [columns] - columns = list(columns) - 
+class SelectNFrame(SelectN):
+    """
+    Implement n largest/smallest for DataFrame

     Parameters
     ----------
-    frame : pandas.DataFrame object
-    columns : list or str
+    obj : DataFrame
     n : int
     keep : {'first', 'last'}, default 'first'
-    method : str, {'nlargest', 'nsmallest'}
+    columns : list or str

     Returns
     -------
     nordered : DataFrame
     """
-    from pandas import Int64Index
-    if not is_list_like(columns):
-        columns = [columns]
-    columns = list(columns)
-    for column in columns:
-        dtype = frame[column].dtype
-        if not _is_valid_dtype_n_method(dtype):
-            raise TypeError((
-                "Column {column!r} has dtype {dtype}, cannot use method "
-                "{method!r} with this dtype"
-            ).format(column=column, dtype=dtype, method=method))
-
-    def get_indexer(current_indexer, other_indexer):
-        """Helper function to concat `current_indexer` and `other_indexer`
-        depending on `method`
-        """
-        if method == 'nsmallest':
-            return current_indexer.append(other_indexer)
-        else:
-            return other_indexer.append(current_indexer)
-
-    # Below we save and reset the index in case index contains duplicates
-    original_index = frame.index
-    cur_frame = frame = frame.reset_index(drop=True)
-    cur_n = n
-    indexer = Int64Index([])
-
-    for i, column in enumerate(columns):
-
-        # For each column we apply method to cur_frame[column]. If it is the
-        # last column in columns, or if the values returned are unique in
-        # frame[column] we save this index and break
-        # Otherwise we must save the index of the non duplicated values
-        # and set the next cur_frame to cur_frame filtered on all duplcicated
-        # values (#GH15297)
-        series = cur_frame[column]
-        values = getattr(series, method)(cur_n, keep=keep)
-        is_last_column = len(columns) - 1 == i
-        if is_last_column or values.nunique() == series.isin(values).sum():
-
-            # Last column in columns or values are unique in series => values
-            # is all that matters
-            indexer = get_indexer(indexer, values.index)
-            break
-
-        duplicated_filter = series.duplicated(keep=False)
-        duplicated = values[duplicated_filter]
-        non_duplicated = values[~duplicated_filter]
-        indexer = get_indexer(indexer, non_duplicated.index)
-
-        # Must set cur frame to include all duplicated values to consider for
-        # the next column, we also can reduce cur_n by the current length of
-        # the indexer
-        cur_frame = cur_frame[series.isin(duplicated)]
-        cur_n = n - len(indexer)
-
-    frame = frame.take(indexer)
-
-    # Restore the index on frame
-    frame.index = original_index.take(indexer)
-    return frame
-
-
-def _finalize_nsmallest(arr, kth_val, n, keep, narr):
-    ns, = np.nonzero(arr <= kth_val)
-    inds = ns[arr[ns].argsort(kind='mergesort')][:n]
-    if keep == 'last':
-        # reverse indices
-        return narr - 1 - inds
-    else:
-        return inds
-
-
-# ------- #
-# helpers #
-# ------- #
-
-def _hashtable_algo(f, values, return_dtype=None):
-    """
-    f(HashTable, type_caster) -> result
-    """
-
-    dtype = values.dtype
-    if is_float_dtype(dtype):
-        return f(htable.Float64HashTable, _ensure_float64)
-    elif is_signed_integer_dtype(dtype):
-        return f(htable.Int64HashTable, _ensure_int64)
-    elif is_unsigned_integer_dtype(dtype):
-        return f(htable.UInt64HashTable, _ensure_uint64)
-    elif is_datetime64_dtype(dtype):
-        return_dtype = return_dtype or 'M8[ns]'
-        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
-    elif is_timedelta64_dtype(dtype):
-        return_dtype = return_dtype or 'm8[ns]'
-        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
-
-    # its cheaper to use a String Hash Table than Object
-    if lib.infer_dtype(values) in ['string']:
-        return f(htable.StringHashTable, _ensure_object)
-
-    # use Object
-    return f(htable.PyObjectHashTable, _ensure_object)
-
-
-_hashtables = {
-    'float64': (htable.Float64HashTable, htable.Float64Vector),
-    'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
-    'int64': (htable.Int64HashTable, htable.Int64Vector),
-    'string': (htable.StringHashTable, htable.ObjectVector),
-    'object': (htable.PyObjectHashTable, htable.ObjectVector)
-}
-def _get_data_algo(values, func_map):
-
-    f = None
-
-    if is_categorical_dtype(values):
-        values = values._values_for_rank()
-
-    if is_float_dtype(values):
-        f = func_map['float64']
-        values = _ensure_float64(values)
-
-    elif needs_i8_conversion(values):
-        f = func_map['int64']
-        values = values.view('i8')
-
-    elif is_signed_integer_dtype(values):
-        f = func_map['int64']
-        values = _ensure_int64(values)
-
-    elif is_unsigned_integer_dtype(values):
-        f = func_map['uint64']
-        values = _ensure_uint64(values)
-
-    else:
-        values = _ensure_object(values)
-
-        # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
-            try:
-                f = func_map['string']
-            except KeyError:
-                pass
-
-    if f is None:
-        f = func_map['object']
-
-    return f, values
-
-# ---- #
+    def __init__(self, obj, n, keep, columns):
+        super(SelectNFrame, self).__init__(obj, n, keep)
+        if not is_list_like(columns):
+            columns = [columns]
+        columns = list(columns)
+        self.columns = columns
+
+    def compute(self, method):
+
+        from pandas import Int64Index
+        n = self.n
+        frame = self.obj
+        columns = self.columns
+
+        for column in columns:
+            dtype = frame[column].dtype
+            if not self.is_valid_dtype_n_method(dtype):
+                raise TypeError((
+                    "Column {column!r} has dtype {dtype}, cannot use method "
+                    "{method!r} with this dtype"
+                ).format(column=column, dtype=dtype, method=method))
+
+        def get_indexer(current_indexer, other_indexer):
+            """Helper function to concat `current_indexer` and `other_indexer`
+            depending on `method`
+            """
+            if method == 'nsmallest':
+                return current_indexer.append(other_indexer)
+            else:
+                return other_indexer.append(current_indexer)
+
+        # Below we save and reset the index in case index contains duplicates
+        original_index = frame.index
+        cur_frame = frame = frame.reset_index(drop=True)
+        cur_n = n
+        indexer = Int64Index([])
+
+        for i, column in enumerate(columns):
+
+            # For each column we apply method to cur_frame[column].
+            # If it is the last column in columns, or if the values
+            # returned are unique in frame[column], we save this index
+            # and break.
+            # Otherwise we must save the index of the non-duplicated values
+            # and set the next cur_frame to cur_frame filtered on all
+            # duplicated values (#GH15297)
+            series = cur_frame[column]
+            values = getattr(series, method)(cur_n, keep=self.keep)
+            is_last_column = len(columns) - 1 == i
+            if is_last_column or values.nunique() == series.isin(values).sum():
+
+                # Last column in columns, or values are unique in the
+                # series => the values are all that matters
+                indexer = get_indexer(indexer, values.index)
+                break
+
+            duplicated_filter = series.duplicated(keep=False)
+            duplicated = values[duplicated_filter]
+            non_duplicated = values[~duplicated_filter]
+            indexer = get_indexer(indexer, non_duplicated.index)
+
+            # Must set cur frame to include all duplicated values
+            # to consider for the next column, we also can reduce
+            # cur_n by the current length of the indexer
+            cur_frame = cur_frame[series.isin(duplicated)]
+            cur_n = n - len(indexer)
+
+        frame = frame.take(indexer)
+
+        # Restore the index on frame
+        frame.index = original_index.take(indexer)
+        return frame
+
+
 # ---- #
 # take #
 # ---- #
@@ -1534,23 +1462,41 @@ def func(arr, indexer, out, fill_value=np.nan):


 def diff(arr, n, axis=0):
-    """ difference of n between self,
-    analagoust to s-s.shift(n) """
+    """
+    difference of n between self,
+    analogous to s - s.shift(n)
+
+    Parameters
+    ----------
+    arr : ndarray
+    n : int
+        number of periods
+    axis : int
+        axis to shift on
+
+    Returns
+    -------
+    shifted
+
+    """
+
     n = int(n)
     na = np.nan
     dtype = arr.dtype
+
+    is_timedelta = False
     if needs_i8_conversion(arr):
         dtype = np.float64
         arr = arr.view('i8')
         na = iNaT
         is_timedelta = True
-    elif issubclass(dtype.type, np.integer):
-        dtype = np.float64
-    elif issubclass(dtype.type, np.bool_):
+
+    elif is_bool_dtype(dtype):
         dtype = np.object_
+
+    elif is_integer_dtype(dtype):
+        dtype = np.float64
+
     dtype = np.dtype(dtype)
     out_arr = np.empty(arr.shape, dtype=dtype)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3980bf6cdbc09..f6199be2d1fc9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3441,7 +3441,10 @@ def nlargest(self, n, columns, keep='first'):
         1  10  b    2
         2   8  d  NaN
         """
-        return algorithms.select_n_frame(self, columns, n, 'nlargest', keep)
+        return algorithms.SelectNFrame(self,
+                                       n=n,
+                                       keep=keep,
+                                       columns=columns).nlargest()

     def nsmallest(self, n, columns, keep='first'):
         """Get the rows of a DataFrame sorted by the `n` smallest
@@ -3475,7 +3478,10 @@ def nsmallest(self, n, columns, keep='first'):
         0  1  a    1
         2  8  d  NaN
         """
-        return algorithms.select_n_frame(self, columns, n, 'nsmallest', keep)
+        return algorithms.SelectNFrame(self,
+                                       n=n,
+                                       keep=keep,
+                                       columns=columns).nsmallest()

     def swaplevel(self, i=-2, j=-1, axis=0):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 1aaa106d2c68f..d6a1a9d98faf4 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1856,8 +1856,7 @@ def nlargest(self, n=5, keep='first'):
         121637    4.240952
         dtype: float64
         """
-        return algorithms.select_n_series(self, n=n, keep=keep,
-                                          method='nlargest')
+        return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()

     def nsmallest(self, n=5, keep='first'):
         """Return the smallest `n` elements.
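Series and DataFrame now simply delegate to the SelectN hierarchy; the public API is unchanged. For the DataFrame case, the column-by-column tie-breaking that SelectNFrame.compute implements looks like this on illustrative data:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 2], 'b': [3, 1, 2]})
df.nlargest(2, ['a', 'b'])
# the two rows with a == 2 tie on the first column, so 'b' decides:
#    a  b
# 2  2  2
# 1  2  1

# equivalent spelling through the new class, as wired up above
from pandas.core import algorithms
algorithms.SelectNFrame(df, n=2, keep='first', columns=['a', 'b']).nlargest()
```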
@@ -1903,8 +1902,7 @@ def nsmallest(self, n=5, keep='first'):
         359919   -4.331927
         dtype: float64
         """
-        return algorithms.select_n_series(self, n=n, keep=keep,
-                                          method='nsmallest')
+        return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest()

     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index ac3a42c3cf122..d893183dae0ed 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -620,9 +620,9 @@ def test_dropna(self):

         # 32-bit linux has a different ordering
         if not compat.is_platform_32bit():
-            tm.assert_series_equal(
-                pd.Series([10.3, 5., 5., None]).value_counts(dropna=False),
-                pd.Series([2, 1, 1], index=[5., 10.3, np.nan]))
+            result = pd.Series([10.3, 5., 5., None]).value_counts(dropna=False)
+            expected = pd.Series([2, 1, 1], index=[5., 10.3, np.nan])
+            tm.assert_series_equal(result, expected)

     def test_value_counts_normalized(self):
         # GH12558
@@ -1356,16 +1356,19 @@ def test_uint64_overflow(self):

     def test_categorical(self):
         c = Categorical([1, 2])
-        exp = Series([1, 2], dtype=np.int64)
-        tm.assert_series_equal(algos.mode(c), exp)
+        exp = c
+        tm.assert_categorical_equal(algos.mode(c), exp)
+        tm.assert_categorical_equal(c.mode(), exp)

         c = Categorical([1, 'a', 'a'])
-        exp = Series(['a'], dtype=object)
-        tm.assert_series_equal(algos.mode(c), exp)
+        exp = Categorical(['a'], categories=[1, 'a'])
+        tm.assert_categorical_equal(algos.mode(c), exp)
+        tm.assert_categorical_equal(c.mode(), exp)

         c = Categorical([1, 1, 2, 3, 3])
-        exp = Series([1, 3], dtype=np.int64)
-        tm.assert_series_equal(algos.mode(c), exp)
+        exp = Categorical([1, 3], categories=[1, 2, 3])
+        tm.assert_categorical_equal(algos.mode(c), exp)
+        tm.assert_categorical_equal(c.mode(), exp)

     def test_index(self):
         idx = Index([1, 2, 3])
diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py
index 8ef2868ae324f..e7b2edeb57714 100644
--- a/pandas/tests/types/test_dtypes.py
+++ b/pandas/tests/types/test_dtypes.py
@@ -149,6 +149,7 @@ def test_construction_from_string(self):
                           lambda: DatetimeTZDtype.construct_from_string('foo'))

     def test_is_dtype(self):
+        self.assertFalse(DatetimeTZDtype.is_dtype(None))
         self.assertTrue(DatetimeTZDtype.is_dtype(self.dtype))
         self.assertTrue(DatetimeTZDtype.is_dtype('datetime64[ns, US/Eastern]'))
         self.assertFalse(DatetimeTZDtype.is_dtype('foo'))
diff --git a/pandas/types/common.py b/pandas/types/common.py
index a1f03e59a5e6e..017805673defe 100644
--- a/pandas/types/common.py
+++ b/pandas/types/common.py
@@ -359,6 +359,8 @@ def _coerce_to_dtype(dtype):


 def _get_dtype(arr_or_dtype):
+    if arr_or_dtype is None:
+        raise TypeError
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, type):
diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index 43135ba94ab46..c3494df93476b 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -82,6 +82,8 @@ def is_dtype(cls, dtype):
             return True
         elif isinstance(dtype, np.dtype):
             return False
+        elif dtype is None:
+            return False
         try:
             return cls.construct_from_string(dtype) is not None
         except:
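The two small pandas/types guards close a loop for `_ensure_data`: dtype probes must be safe on `None`. `is_dtype(None)` now returns False early, and `_get_dtype(None)` raises rather than letting `None` fall through to `np.dtype(None)`, which numpy silently resolves to float64. A quick sketch of the pinned behavior (module paths as of this branch's pre-0.20 layout):

```python
# Behavior pinned by the new test and the None guards above.
from pandas.types.dtypes import DatetimeTZDtype
from pandas.types.common import _get_dtype

DatetimeTZDtype.is_dtype(None)    # -> False

try:
    _get_dtype(None)
except TypeError:
    pass                          # now raises, instead of returning float64
```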