diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 7920f05b5e7a1..8e5b111527d2d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -221,8 +221,8 @@ def sort_mixed(values):
         ordered = sort_mixed(values)
     else:
         try:
-            sorter = values.argsort()
-            ordered = values.take(sorter)
+            sorter = _ensure_int64(values.argsort())
+            ordered = take_nd(values, sorter, allow_fill=False)
         except TypeError:
             # try this anyway
             ordered = sort_mixed(values)
@@ -235,7 +235,7 @@ def sort_mixed(values):
     if not is_list_like(labels):
         raise TypeError("Only list-like objects or None are allowed to be"
                         "passed to safe_sort as labels")
-    labels = _ensure_platform_int(np.asarray(labels))
+    labels = np.asarray(labels)

     from pandas import Index
     if not assume_unique and not Index(values).is_unique:
@@ -246,18 +246,16 @@ def sort_mixed(values):
         (hash_klass, _), values = _get_data_algo(values, _hashtables)
         t = hash_klass(len(values))
         t.map_locations(values)
-        sorter = _ensure_platform_int(t.lookup(ordered))
+        sorter = t.lookup(ordered)

     reverse_indexer = np.empty(len(sorter), dtype=np.int_)
     reverse_indexer.put(sorter, np.arange(len(sorter)))

     mask = (labels < -len(values)) | (labels >= len(values)) | \
         (labels == na_sentinel)
+    np.putmask(labels, mask, -1)

-    # (Out of bound indices will be masked with `na_sentinel` next, so we may
-    # deal with them here without performance loss using `mode='wrap'`.)
-    new_labels = reverse_indexer.take(labels, mode='wrap')
-    np.putmask(new_labels, mask, na_sentinel)
+    new_labels = take_nd(reverse_indexer, labels, fill_value=na_sentinel)

     return ordered, new_labels

@@ -304,8 +302,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     uniques = vec_klass()
     labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

-    labels = _ensure_platform_int(labels)
-
     uniques = uniques.to_array()

     if sort and len(uniques) > 0:
@@ -825,6 +821,7 @@ def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info):
                 out[i, j] = arr[u_, v]


+# is this used ?
 def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     if mask_info is not None:
         mask, needs_masking = mask_info
@@ -1076,7 +1073,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

     func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                  mask_info=mask_info)
-    indexer = _ensure_int64(indexer)
     func(arr, indexer, out, fill_value)

     if flip_order:
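The safe_sort change above folds the old `take(..., mode='wrap')` plus `putmask` pair into a single fill-aware take. Below is a rough NumPy-only sketch of the intended label remapping; `remap_labels` is a hypothetical helper for illustration, not part of pandas.

```python
import numpy as np

# Hypothetical helper (not part of pandas): labels outside [-n, n) or equal to
# na_sentinel are masked to -1 first, everything else is mapped through
# reverse_indexer (in-range negatives wrap, as with the old mode='wrap').
def remap_labels(reverse_indexer, labels, na_sentinel=-1):
    labels = np.asarray(labels).copy()
    n = len(reverse_indexer)
    mask = (labels < -n) | (labels >= n) | (labels == na_sentinel)
    labels[mask] = -1

    out = np.empty(len(labels), dtype=reverse_indexer.dtype)
    valid = labels != -1
    out[valid] = reverse_indexer[labels[valid]]
    out[~valid] = na_sentinel
    return out

print(remap_labels(np.array([2, 0, 1]), [0, 2, -1, 5]))  # [ 2  1 -1 -1]
```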
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4416213817ab4..48880f6b81cbd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -48,7 +48,6 @@
                                  _ensure_float,
                                  _ensure_float64,
                                  _ensure_int64,
-                                 _ensure_platform_int,
                                  is_list_like,
                                  is_iterator,
                                  is_sequence,
@@ -3195,7 +3194,6 @@ def trans(v):
                 keys.append(trans(k))
             indexer = _lexsort_indexer(keys, orders=ascending,
                                        na_position=na_position)
-            indexer = _ensure_platform_int(indexer)
         else:
             from pandas.core.groupby import _nargsort

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index c2ab406e1da65..caaba7d0760dd 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1706,7 +1706,6 @@ def get_group_levels(self):

         name_list = []
         for ping, labels in zip(self.groupings, self.recons_labels):
-            labels = _ensure_platform_int(labels)
             levels = ping.group_index.take(labels)
             name_list.append(levels)

@@ -4368,7 +4367,7 @@ def _get_group_index_sorter(group_index, ngroups):
     if alpha + beta * ngroups < count * np.log(count):
         sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
                                              ngroups)
-        return _ensure_platform_int(sorter)
+        return sorter
     else:
         return group_index.argsort(kind='mergesort')

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 933ecd1b8de86..5efc002dfcfd2 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -10,7 +10,7 @@
                                  is_list_like,
                                  is_sequence,
                                  is_scalar,
-                                 _ensure_platform_int)
+                                 _ensure_int64)
 from pandas.types.missing import isnull, _infer_fill_value
 from pandas.core.index import Index, MultiIndex

@@ -864,7 +864,6 @@ def _convert_for_reindex(self, key, axis=0):
             keyarr = _asarray_tuplesafe(key)

             if is_integer_dtype(keyarr) and not labels.is_integer():
-                keyarr = _ensure_platform_int(keyarr)
                 return labels.take(keyarr)

             return keyarr
@@ -1853,20 +1852,12 @@ def maybe_convert_indices(indices, n):
     """ if we have negative indicies, translate to postive here
     if have indicies that are out-of-bounds, raise an IndexError
     """
-    if isinstance(indices, list):
-        indices = np.array(indices)
-        if len(indices) == 0:
-            # If list is empty, np.array will return float and cause indexing
-            # errors.
-            return np.empty(0, dtype=np.int_)
-
-    mask = indices < 0
-    if mask.any():
-        indices[mask] += n
-    mask = (indices >= n) | (indices < 0)
-    if mask.any():
-        raise IndexError("indices are out-of-bounds")
-    return indices
+    # return indices
+    from pandas.algos import take_bounds_check
+    indices = _ensure_int64(indices)
+    out = np.empty(len(indices), dtype='int64')
+    take_bounds_check(indices, out, n)
+    return out


 def maybe_convert_ix(*args):
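`maybe_convert_indices` now delegates wrapping and bounds checking to the new `take_bounds_check` kernel. The snippet below is a NumPy-only illustration of the contract it is expected to keep, not the Cython implementation; `convert_indices` is an illustrative name.

```python
import numpy as np

# Wrap negative indices by n, then raise if anything is still out of range.
def convert_indices(indices, n):
    indices = np.asarray(indices, dtype=np.int64)
    out = np.where(indices < 0, indices + n, indices)
    if ((out < 0) | (out >= n)).any():
        raise IndexError("indices are out of bounds")
    return out

print(convert_indices([0, -1, 2], 3))  # [0 2 2]
# convert_indices([3], 3) raises IndexError
```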
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 4f601a2d377a6..bc1c645548bff 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -6,7 +6,7 @@

 import numpy as np

-from pandas.types.common import _ensure_platform_int, is_list_like
+from pandas.types.common import is_list_like
 from pandas.types.cast import _maybe_promote
 from pandas.types.missing import notnull
 import pandas.types.concat as _concat
@@ -114,10 +114,10 @@ def _make_sorted_values_labels(self):
         ngroups = len(obs_ids)

         indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
-        indexer = _ensure_platform_int(indexer)

         self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
-        self.sorted_labels = [l.take(indexer) for l in to_sort]
+        self.sorted_labels = [algos.take_nd(l, indexer, allow_fill=False)
+                              for l in to_sort]

     def _make_selectors(self):
         new_levels = self.new_index_levels
@@ -129,7 +129,6 @@ def _make_selectors(self):
         comp_index, obs_ids = get_compressed_ids(remaining_labels,
                                                  level_sizes)
         ngroups = len(obs_ids)

-        comp_index = _ensure_platform_int(comp_index)
         stride = self.index.levshape[self.level] + self.lift
         self.full_shape = ngroups, stride
diff --git a/pandas/core/series.py b/pandas/core/series.py
index e388683012a66..ae6a940453bbd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1768,7 +1768,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         elif isinstance(index, MultiIndex):
             from pandas.core.groupby import _lexsort_indexer
             indexer = _lexsort_indexer(index.labels, orders=ascending)
-            indexer = _ensure_platform_int(indexer)
             new_index = index.take(indexer)
         else:
             new_index, indexer = index.sort_values(return_indexer=True,
@@ -2381,14 +2380,14 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
         numpy.ndarray.take
         """
         nv.validate_take(tuple(), kwargs)
+        indices = np.asarray(indices)

         # check/convert indicies here
         if convert:
             indices = maybe_convert_indices(indices, len(self._get_axis(axis)))

-        indices = _ensure_platform_int(indices)
-        new_index = self.index.take(indices)
-        new_values = self._values.take(indices)
+        new_index = self.index.take(indices, convert=False)
+        new_values = algos.take_nd(self._values, indices, allow_fill=False)

         return self._constructor(new_values,
                                  index=new_index).__finalize__(self)
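At the user level the `Series.take` rewrite should be behaviour-preserving: indices stay positional and negatives still count from the end. A quick usage check, assuming a recent pandas (the `convert=False` / `take_nd` plumbing in the diff is internal detail):

```python
import pandas as pd

# Positional take with a negative index; result should be unchanged by the diff.
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s.take([2, 0, -1]).tolist())  # [30, 10, 30]
```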
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index de7780d25b1e5..4023bad634746 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -1463,38 +1463,39 @@ def _ensure_compat_concat(indexes):

     @Appender(_index_shared_docs['take'])
     def take(self, indices, axis=0, allow_fill=True,
-             fill_value=None, **kwargs):
+             fill_value=None, convert=True, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
-        if self._can_hold_na:
+
+        if not self._can_hold_na and allow_fill and fill_value is not None:
+            msg = 'Unable to fill values because {0} cannot contain NA'
+            raise ValueError(msg.format(self.__class__.__name__))
+        else:
             taken = self._assert_take_fillable(self.values, indices,
                                                allow_fill=allow_fill,
                                                fill_value=fill_value,
-                                               na_value=self._na_value)
-        else:
-            if allow_fill and fill_value is not None:
-                msg = 'Unable to fill values because {0} cannot contain NA'
-                raise ValueError(msg.format(self.__class__.__name__))
-            taken = self.values.take(indices)
+                                               na_value=self._na_value,
+                                               convert=convert)
         return self._shallow_copy(taken)

     def _assert_take_fillable(self, values, indices, allow_fill=True,
-                              fill_value=None, na_value=np.nan):
-        """ Internal method to handle NA filling of take """
-        indices = _ensure_platform_int(indices)
-
-        # only fill if we are passing a non-None fill_value
+                              fill_value=None, convert=True, na_value=np.nan):
+        """ internal method to handle NA filling of take """
+        indices = np.asarray(indices)
         if allow_fill and fill_value is not None:
             if (indices < -1).any():
                 msg = ('When allow_fill=True and fill_value is not None, '
                        'all indices must be >= -1')
                 raise ValueError(msg)
-            taken = values.take(indices)
-            mask = indices == -1
-            if mask.any():
-                taken[mask] = na_value
+            else:
+                taken = algos.take_nd(values, indices, allow_fill=allow_fill,
+                                      fill_value=na_value)
         else:
-            taken = values.take(indices)
+            # provide wraparound semantics if fill_value not specified
+            if convert:
+                from pandas.core.indexing import maybe_convert_indices
+                n = values.shape[0]
+                indices = maybe_convert_indices(indices, n)
+            taken = algos.take_nd(values, indices, allow_fill=False)
         return taken

     @cache_readonly
@@ -2529,7 +2530,7 @@ def _reindex_non_unique(self, target):

         if len(missing):
             l = np.arange(len(indexer))
-            missing = _ensure_platform_int(missing)
+            missing = missing
             missing_labels = target.take(missing)
             missing_indexer = _ensure_int64(l[~check])
             cur_labels = self.take(indexer[check])._values
@@ -2723,12 +2724,9 @@ def _join_non_unique(self, other, how='left', return_indexers=False):
                                                  [other._values], how=how,
                                                  sort=True)

-        left_idx = _ensure_platform_int(left_idx)
-        right_idx = _ensure_platform_int(right_idx)
-
-        join_index = self.values.take(left_idx)
-        mask = left_idx == -1
-        np.putmask(join_index, mask, other._values.take(right_idx))
+        lvals = algos.take_nd(self.values, left_idx, fill_value=-1)
+        rvals = algos.take_nd(other._values, right_idx, fill_value=-1)
+        join_index = np.where(left_idx == -1, rvals, lvals)

         join_index = self._wrap_joined_index(join_index, other)

diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
index f1d4fe2f26bdd..6b05aabf9df42 100644
--- a/pandas/indexes/category.py
+++ b/pandas/indexes/category.py
@@ -5,7 +5,6 @@
 from pandas.compat.numpy import function as nv
 from pandas.types.generic import ABCCategorical, ABCSeries
 from pandas.types.common import (is_categorical_dtype,
-                                 _ensure_platform_int,
                                  is_list_like,
                                  is_scalar)
 from pandas.types.missing import array_equivalent
@@ -466,7 +465,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):

         codes = self.categories.get_indexer(target)
         indexer, _ = self._engine.get_indexer_non_unique(codes)
-        return _ensure_platform_int(indexer)
+        return indexer

     def get_indexer_non_unique(self, target):
         """ this is the same for a CategoricalIndex for get_indexer; the API
@@ -497,7 +496,6 @@ def _convert_list_indexer(self, keyarr, kind=None):
     def take(self, indices, axis=0, allow_fill=True,
              fill_value=None, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
         taken = self._assert_take_fillable(self.codes, indices,
                                            allow_fill=allow_fill,
                                            fill_value=fill_value,
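The restructured `Index.take` keeps the two existing modes apart: with `allow_fill=True` and a `fill_value`, `-1` marks a missing slot; otherwise `-1` is ordinary end-relative indexing. A behavioural sketch only; keyword availability varies by pandas version.

```python
import numpy as np
import pandas as pd

# Two take modes: plain wraparound vs. -1-as-missing with a fill value.
idx = pd.Index([1.0, 2.0, 3.0])
print(idx.take([0, -1]).tolist())                                      # [1.0, 3.0]
print(idx.take([0, -1], allow_fill=True, fill_value=np.nan).tolist())  # [1.0, nan]
```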
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
index 95ef18d23a037..2f4180a8f0aa3 100644
--- a/pandas/indexes/multi.py
+++ b/pandas/indexes/multi.py
@@ -1038,7 +1038,6 @@ def __getitem__(self, key):
     def take(self, indices, axis=0, allow_fill=True,
              fill_value=None, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
         taken = self._assert_take_fillable(self.labels, indices,
                                            allow_fill=allow_fill,
                                            fill_value=fill_value,
@@ -1055,17 +1054,16 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
                 msg = ('When allow_fill=True and fill_value is not None, '
                        'all indices must be >= -1')
                 raise ValueError(msg)
-            taken = [lab.take(indices) for lab in self.labels]
-            mask = indices == -1
-            if mask.any():
-                masked = []
-                for new_label in taken:
-                    label_values = new_label.values()
-                    label_values[mask] = na_value
-                    masked.append(base.FrozenNDArray(label_values))
-                taken = masked
+            taken = [algos.take_nd(lab, indices, fill_value=na_value)
+                     for lab in values]
         else:
-            taken = [lab.take(indices) for lab in self.labels]
+            # provide wraparound semantics
+            from pandas.core.indexing import maybe_convert_indices
+            taken = []
+            for i, lab in enumerate(values):
+                lab = maybe_convert_indices(lab, len(self.levels[i]))
+                taken.append(algos.take_nd(lab, indices, allow_fill=False))
+
         return taken

     def append(self, other):
@@ -1340,7 +1338,6 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
             if not ascending:
                 indexer = indexer[::-1]

-        indexer = _ensure_platform_int(indexer)
         new_labels = [lab.take(indexer) for lab in self.labels]

         new_index = MultiIndex(labels=new_labels, levels=self.levels,
@@ -1786,7 +1783,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
                 # selected
                 from pandas import Series
                 mapper = Series(indexer)
-                indexer = labels.take(_ensure_platform_int(indexer))
+                indexer = labels.take(indexer)
                 result = Series(Index(indexer).isin(r).nonzero()[0])
                 m = result.map(mapper)._values

diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi
index 59b3ddff46dec..5957d56bed298 100644
--- a/pandas/src/algos_common_helper.pxi
+++ b/pandas/src/algos_common_helper.pxi
@@ -2847,17 +2847,16 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
 #----------------------------------------------------------------------
 # ensure_dtype
 #----------------------------------------------------------------------
-
-cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num

 cpdef ensure_platform_int(object arr):
     if util.is_array(arr):
         if (<ndarray> arr).descr.type_num == PLATFORM_INT:
             return arr
         else:
-            return arr.astype(np.int_)
+            return arr.astype(np.intp)
     else:
-        return np.array(arr, dtype=np.int_)
+        return np.array(arr, dtype=np.intp)

 cpdef ensure_object(object arr):
     if util.is_array(arr):
diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
index 2327f10389cb5..56b327760a951 100644
--- a/pandas/src/algos_common_helper.pxi.in
+++ b/pandas/src/algos_common_helper.pxi.in
@@ -547,17 +547,16 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
 #----------------------------------------------------------------------
 # ensure_dtype
 #----------------------------------------------------------------------
-
-cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num

 cpdef ensure_platform_int(object arr):
     if util.is_array(arr):
         if (<ndarray> arr).descr.type_num == PLATFORM_INT:
             return arr
         else:
-            return arr.astype(np.int_)
+            return arr.astype(np.intp)
     else:
-        return np.array(arr, dtype=np.int_)
+        return np.array(arr, dtype=np.intp)

 cpdef ensure_object(object arr):
     if util.is_array(arr):
@@ -600,4 +599,4 @@ cpdef ensure_{{name}}(object arr):
     else:
         return np.array(arr, dtype=np.{{dtype}})

-{{endfor}}
\ No newline at end of file
+{{endfor}}
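The switch from `np.int_` to `np.intp` in `ensure_platform_int` matters mainly on 64-bit Windows, where `np.int_` is a 32-bit C long while `np.intp` always matches the pointer-sized integer NumPy itself uses for indexing and argsort results:

```python
import numpy as np

# Compare the two "platform int" candidates; sizes differ only on win64.
print(np.dtype(np.int_).itemsize, np.dtype(np.intp).itemsize)
# e.g. 4 8 on 64-bit Windows, 8 8 on 64-bit Linux/macOS
```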
diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi
index d8fb05804d4e5..6f9c5efdeffbf 100644
--- a/pandas/src/algos_take_helper.pxi
+++ b/pandas/src/algos_take_helper.pxi
@@ -4947,3 +4947,22 @@ def take_2d_multi_object_object(ndarray[object, ndim=2] values,
                     out[i, j] = fv
                 else:
                     out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_bounds_check(int64_t[:] indexer, int64_t[:] out, int64_t n):
+    cdef:
+        Py_ssize_t i
+        int64_t label
+
+    with nogil:
+        for i in range(indexer.shape[0]):
+            label = indexer[i]
+            if label < 0:
+                label += n
+
+            if label >= n or label < 0:
+                with gil:
+                    raise IndexError("indices are out of bounds")
+            out[i] = label
diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in
index e9abbcd13f499..842c68c3ba58b 100644
--- a/pandas/src/algos_take_helper.pxi.in
+++ b/pandas/src/algos_take_helper.pxi.in
@@ -258,4 +258,23 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
                 else:
                     out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}}

-{{endfor}}
\ No newline at end of file
+{{endfor}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_bounds_check(int64_t[:] indexer, int64_t[:] out, int64_t n):
+    cdef:
+        Py_ssize_t i
+        int64_t label
+
+    with nogil:
+        for i in range(indexer.shape[0]):
+            label = indexer[i]
+            if label < 0:
+                label += n
+
+            if label >= n or label < 0:
+                with gil:
+                    raise IndexError("indices are out of bounds")
+            out[i] = label
diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx
index f3c7577ef528a..87733c33d12e4 100644
--- a/pandas/src/join.pyx
+++ b/pandas/src/join.pyx
@@ -32,7 +32,8 @@ float64 = np.dtype(np.float64)
 cdef double NaN = <double> np.NaN
 cdef double nan = NaN

-from pandas.algos import groupsort_indexer
+from pandas.algos import groupsort_indexer, ensure_platform_int
+from pandas.core.algorithms import take_nd


 def inner_join(ndarray[int64_t] left, ndarray[int64_t] right,
@@ -146,18 +147,14 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
             # no multiple matches for any row on the left
             # this is a short-cut to avoid groupsort_indexer
            # otherwise, the `else` path also works in this case
-            if left_sorter.dtype != np.int_:
-                left_sorter = left_sorter.astype(np.int_)
-
-            rev = np.empty(len(left), dtype=np.int_)
+            rev = np.empty(len(left), dtype=np.int64)
             rev.put(left_sorter, np.arange(len(left)))
         else:
             rev, _ = groupsort_indexer(left_indexer, len(left))

-        if rev.dtype != np.int_:
-            rev = rev.astype(np.int_)
-        right_indexer = right_indexer.take(rev)
-        left_indexer = left_indexer.take(rev)
+        rev = ensure_platform_int(rev)
+        right_indexer = take_nd(right_indexer, rev, allow_fill=False)
+        left_indexer = take_nd(left_indexer, rev, allow_fill=False)

     return left_indexer, right_indexer

@@ -284,11 +281,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,


 def _get_result_indexer(sorter, indexer):
-    if indexer.dtype != np.int_:
-        indexer = indexer.astype(np.int_)
     if len(sorter) > 0:
-        res = sorter.take(indexer)
-        np.putmask(res, indexer == -1, -1)
+        res = take_nd(sorter, indexer, fill_value=-1)
     else:
         # length-0 case
         res = np.empty(len(indexer), dtype=np.int64)
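In `_get_result_indexer`, `take_nd(sorter, indexer, fill_value=-1)` replaces the old take-then-putmask pair. Below is a NumPy-only sketch of the mapping it performs; `map_through_sorter` is an illustrative name, not a pandas function.

```python
import numpy as np

# -1 in `indexer` means "no match" and must stay -1 in the result.
def map_through_sorter(sorter, indexer):
    return np.where(indexer == -1, -1, sorter[indexer])

print(map_through_sorter(np.array([5, 7, 9]), np.array([1, -1, 2])))  # [ 7 -1  9]
```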
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 571df70e05c6d..ba35ea2840db7 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -23,7 +23,6 @@
                                  is_bool,
                                  is_list_like,
                                  _ensure_int64,
-                                 _ensure_platform_int,
                                  _ensure_object)
 from pandas.types.missing import na_value_for_dtype

@@ -434,12 +433,11 @@ def _merger(x, y):

     # if we DO have duplicates, then
     # we cannot guarantee order
-    sorter = _ensure_platform_int(
-        np.concatenate([groupby.indices[g] for g, _ in groupby]))
+    sorter = np.concatenate([groupby.indices[g] for g, _ in groupby])
     if len(result) != len(sorter):
         return result

-    rev = np.empty(len(sorter), dtype=np.int_)
+    rev = np.empty(len(sorter), dtype=np.int64)
     rev.put(sorter, np.arange(len(sorter)))
     return result.take(rev).reset_index(drop=True)

@@ -599,7 +597,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):

             if name in self.left:
                 if left_has_missing is None:
-                    left_has_missing = any(left_indexer == -1)
+                    left_has_missing = (left_indexer == -1).any()

                 if left_has_missing:
                     take_right = self.right_join_keys[i]
@@ -611,7 +609,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):

             elif name in self.right:
                 if right_has_missing is None:
-                    right_has_missing = any(right_indexer == -1)
+                    right_has_missing = (right_indexer == -1).any()

                 if right_has_missing:
                     take_left = self.left_join_keys[i]
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 188f538372092..3e442dbaab524 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -332,7 +332,7 @@ def sort_values(self, return_indexer=False, ascending=True):

     @Appender(_index_shared_docs['take'])
     def take(self, indices, axis=0, allow_fill=True,
-             fill_value=None, **kwargs):
+             fill_value=None, convert=True, **kwargs):
         nv.validate_take(tuple(), kwargs)
         indices = _ensure_int64(indices)

@@ -343,7 +343,8 @@ def take(self, indices, axis=0, allow_fill=True,
         taken = self._assert_take_fillable(self.asi8, indices,
                                            allow_fill=allow_fill,
                                            fill_value=fill_value,
-                                           na_value=tslib.iNaT)
+                                           na_value=tslib.iNaT,
+                                           convert=convert)

         # keep freq in PeriodIndex, reset otherwise
         freq = self.freq if isinstance(self, ABCPeriodIndex) else None
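The merge.py change from `any(left_indexer == -1)` to `(left_indexer == -1).any()` swaps a Python-level loop over the boolean array for a single vectorized reduction; the result is identical, just cheaper on large indexers:

```python
import numpy as np

# Builtin any() iterates the array element by element; ndarray.any() reduces once.
left_indexer = np.array([0, 2, -1, 3])
print(any(left_indexer == -1), (left_indexer == -1).any())  # True True
```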