
CLN/PERF: remove ndarray.take and platform int conversions #13924

Closed - wants to merge 6 commits
18 changes: 7 additions & 11 deletions pandas/core/algorithms.py
@@ -221,8 +221,8 @@ def sort_mixed(values):
         ordered = sort_mixed(values)
     else:
         try:
-            sorter = values.argsort()
-            ordered = values.take(sorter)
+            sorter = _ensure_int64(values.argsort())
+            ordered = take_nd(values, sorter, allow_fill=False)
         except TypeError:
             # try this anyway
             ordered = sort_mixed(values)

Review comment (on the take_nd call) - Contributor: I would do as many ensures inside take_nd as you can, so we can just call it with anything and it will work.

Reply - Contributor Author: Agree - take_nd already does this check, so I didn't need this.
@@ -235,7 +235,7 @@ def sort_mixed(values):
     if not is_list_like(labels):
         raise TypeError("Only list-like objects or None are allowed to be"
                         "passed to safe_sort as labels")
-    labels = _ensure_platform_int(np.asarray(labels))
+    labels = np.asarray(labels)

     from pandas import Index
     if not assume_unique and not Index(values).is_unique:
@@ -246,18 +246,16 @@ def sort_mixed(values):
     (hash_klass, _), values = _get_data_algo(values, _hashtables)
     t = hash_klass(len(values))
     t.map_locations(values)
-    sorter = _ensure_platform_int(t.lookup(ordered))
+    sorter = t.lookup(ordered)

     reverse_indexer = np.empty(len(sorter), dtype=np.int_)
     reverse_indexer.put(sorter, np.arange(len(sorter)))

     mask = (labels < -len(values)) | (labels >= len(values)) | \
         (labels == na_sentinel)
+    np.putmask(labels, mask, -1)

-    # (Out of bound indices will be masked with `na_sentinel` next, so we may
-    # deal with them here without performance loss using `mode='wrap'`.)
-    new_labels = reverse_indexer.take(labels, mode='wrap')
-    np.putmask(new_labels, mask, na_sentinel)
+    new_labels = take_nd(reverse_indexer, labels, fill_value=na_sentinel)

     return ordered, new_labels
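For reference, the old and new label-remapping paths in safe_sort give the same result; a minimal standalone NumPy sketch of both (take_fill is a hypothetical stand-in for take_nd's fill behavior, not pandas code):

```python
import numpy as np

def take_fill(arr, indexer, fill_value):
    # hypothetical stand-in for take_nd(arr, indexer, fill_value=...): -1 marks "missing"
    out = arr.take(indexer).astype(np.result_type(arr, fill_value))
    out[indexer == -1] = fill_value
    return out

n = 3                                    # len(values)
reverse_indexer = np.array([2, 0, 1])
labels = np.array([0, 1, -1, 5, -7])     # -1 == na_sentinel; 5 and -7 are out of bounds
na_sentinel = -1

mask = (labels < -n) | (labels >= n) | (labels == na_sentinel)

# old path: wraparound take, then overwrite the bad slots afterwards
old = reverse_indexer.take(labels, mode='wrap')
np.putmask(old, mask, na_sentinel)

# new path (this PR): clamp bad labels to -1 up front, then fill on take
new_labels = labels.copy()
np.putmask(new_labels, mask, -1)
new = take_fill(reverse_indexer, new_labels, na_sentinel)

assert (old == new).all()
```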

@@ -304,8 +302,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     uniques = vec_klass()
     labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

-    labels = _ensure_platform_int(labels)
-
     uniques = uniques.to_array()

     if sort and len(uniques) > 0:
@@ -825,6 +821,7 @@ def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info):
                 out[i, j] = arr[u_, v]


+# is this used ?
 def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
     if mask_info is not None:
         mask, needs_masking = mask_info
@@ -1076,7 +1073,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

     func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                  mask_info=mask_info)
-    indexer = _ensure_int64(indexer)
     func(arr, indexer, out, fill_value)

     if flip_order:
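The point of routing everything through take_nd instead of ndarray.take is that -1 becomes a true missing-value marker (with dtype promotion) rather than "last element". A small sketch; pandas.api.extensions.take is the public wrapper later pandas versions expose for this behavior (at the time of this PR the equivalent was the internal take_nd):

```python
import numpy as np
from pandas.api.extensions import take  # public fill-aware take in later pandas

arr = np.array([10, 20, 30])

# plain ndarray.take has no notion of a missing slot: -1 just means "last element"
print(arr.take([0, 2, -1]))                                         # [10 30 30]

# the fill-aware take treats -1 as missing, promoting int -> float to hold NaN
print(take(arr, [0, 2, -1], allow_fill=True, fill_value=np.nan))    # [10. 30. nan]
```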
2 changes: 0 additions & 2 deletions pandas/core/frame.py
@@ -48,7 +48,6 @@
     _ensure_float,
     _ensure_float64,
     _ensure_int64,
-    _ensure_platform_int,
     is_list_like,
     is_iterator,
     is_sequence,
@@ -3195,7 +3194,6 @@ def trans(v):
                 keys.append(trans(k))
             indexer = _lexsort_indexer(keys, orders=ascending,
                                        na_position=na_position)
-            indexer = _ensure_platform_int(indexer)
         else:
             from pandas.core.groupby import _nargsort

3 changes: 1 addition & 2 deletions pandas/core/groupby.py
@@ -1706,7 +1706,6 @@ def get_group_levels(self):

        name_list = []
        for ping, labels in zip(self.groupings, self.recons_labels):
-            labels = _ensure_platform_int(labels)
            levels = ping.group_index.take(labels)

            name_list.append(levels)
@@ -4368,7 +4367,7 @@ def _get_group_index_sorter(group_index, ngroups):
     if alpha + beta * ngroups < count * np.log(count):
         sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
                                              ngroups)
-        return _ensure_platform_int(sorter)
+        return sorter
     else:
         return group_index.argsort(kind='mergesort')

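The heuristic kept above chooses between the Cython counting sort (_algos.groupsort_indexer, roughly O(count + ngroups)) and a stable mergesort argsort (O(count log count)). A rough pure-Python/NumPy sketch of what the counting-sort path computes (it ignores the -1/NA group that the real Cython routine also handles):

```python
import numpy as np

def groupsort_indexer_py(group_index, ngroups):
    # stable counting sort over group ids; sketch only, no -1/NA handling
    counts = np.bincount(group_index, minlength=ngroups)
    start = np.zeros(ngroups, dtype=np.intp)
    start[1:] = np.cumsum(counts)[:-1]
    indexer = np.empty(len(group_index), dtype=np.intp)
    pos = start.copy()
    for i, g in enumerate(group_index):
        indexer[pos[g]] = i
        pos[g] += 1
    return indexer

gi = np.array([2, 0, 1, 0, 2, 1])
assert (groupsort_indexer_py(gi, 3) == gi.argsort(kind='mergesort')).all()
```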
23 changes: 7 additions & 16 deletions pandas/core/indexing.py
@@ -10,7 +10,7 @@
     is_list_like,
     is_sequence,
     is_scalar,
-    _ensure_platform_int)
+    _ensure_int64)
 from pandas.types.missing import isnull, _infer_fill_value

 from pandas.core.index import Index, MultiIndex
@@ -864,7 +864,6 @@ def _convert_for_reindex(self, key, axis=0):
            keyarr = _asarray_tuplesafe(key)

            if is_integer_dtype(keyarr) and not labels.is_integer():
-                keyarr = _ensure_platform_int(keyarr)
                return labels.take(keyarr)

            return keyarr
@@ -1853,20 +1852,12 @@ def maybe_convert_indices(indices, n):
     """ if we have negative indicies, translate to postive here
     if have indicies that are out-of-bounds, raise an IndexError
     """
-    if isinstance(indices, list):
-        indices = np.array(indices)
-        if len(indices) == 0:
-            # If list is empty, np.array will return float and cause indexing
-            # errors.
-            return np.empty(0, dtype=np.int_)
-
-    mask = indices < 0
-    if mask.any():
-        indices[mask] += n
-    mask = (indices >= n) | (indices < 0)
-    if mask.any():
-        raise IndexError("indices are out-of-bounds")
-    return indices
+    # return indices
+    from pandas.algos import take_bounds_check
+    indices = _ensure_int64(indices)
+    out = np.empty(len(indices), dtype='int64')
+    take_bounds_check(indices, out, n)
+    return out


 def maybe_convert_ix(*args):
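The replacement routes the negative-index translation and bounds check through a Cython helper (pandas.algos.take_bounds_check, added elsewhere in this PR and not shown here). For reference, a pure-NumPy sketch of the semantics the old Python code implemented (maybe_convert_indices_ref is an illustrative name, not pandas API):

```python
import numpy as np

def maybe_convert_indices_ref(indices, n):
    """Reference semantics: translate negative indices, raise if out of bounds."""
    indices = np.asarray(indices, dtype=np.intp)
    if len(indices) == 0:
        return indices
    indices = np.where(indices < 0, indices + n, indices)
    if ((indices < 0) | (indices >= n)).any():
        raise IndexError("indices are out-of-bounds")
    return indices

print(maybe_convert_indices_ref([0, -1, 2], 5))   # [0 4 2]
```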
7 changes: 3 additions & 4 deletions pandas/core/reshape.py
@@ -6,7 +6,7 @@

 import numpy as np

-from pandas.types.common import _ensure_platform_int, is_list_like
+from pandas.types.common import is_list_like
 from pandas.types.cast import _maybe_promote
 from pandas.types.missing import notnull
 import pandas.types.concat as _concat
@@ -114,10 +114,10 @@ def _make_sorted_values_labels(self):
         ngroups = len(obs_ids)

         indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
-        indexer = _ensure_platform_int(indexer)

         self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
-        self.sorted_labels = [l.take(indexer) for l in to_sort]
+        self.sorted_labels = [algos.take_nd(l, indexer, allow_fill=False)
+                              for l in to_sort]

     def _make_selectors(self):
         new_levels = self.new_index_levels
@@ -129,7 +129,6 @@ def _make_selectors(self):
         comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
         ngroups = len(obs_ids)

-        comp_index = _ensure_platform_int(comp_index)
         stride = self.index.levshape[self.level] + self.lift
         self.full_shape = ngroups, stride

7 changes: 3 additions & 4 deletions pandas/core/series.py
@@ -1768,7 +1768,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         elif isinstance(index, MultiIndex):
             from pandas.core.groupby import _lexsort_indexer
             indexer = _lexsort_indexer(index.labels, orders=ascending)
-            indexer = _ensure_platform_int(indexer)
             new_index = index.take(indexer)
         else:
             new_index, indexer = index.sort_values(return_indexer=True,
@@ -2381,14 +2380,14 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
        numpy.ndarray.take
        """
        nv.validate_take(tuple(), kwargs)
+        indices = np.asarray(indices)

        # check/convert indicies here
        if convert:
            indices = maybe_convert_indices(indices, len(self._get_axis(axis)))

-        indices = _ensure_platform_int(indices)
-        new_index = self.index.take(indices)
-        new_values = self._values.take(indices)
+        new_index = self.index.take(indices, convert=False)
+        new_values = algos.take_nd(self._values, indices, allow_fill=False)
        return self._constructor(new_values,
                                 index=new_index).__finalize__(self)

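Behavior of the public API is unchanged here; with convert=True (the default) negative positions are still translated before the take. A quick sanity check that works the same before and after this change:

```python
import pandas as pd

s = pd.Series([10, 20, 30], index=list("abc"))
print(s.take([0, -1]))
# a    10
# c    30
# dtype: int64
```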
50 changes: 24 additions & 26 deletions pandas/indexes/base.py
@@ -1463,38 +1463,39 @@ def _ensure_compat_concat(indexes):

     @Appender(_index_shared_docs['take'])
     def take(self, indices, axis=0, allow_fill=True,
-             fill_value=None, **kwargs):
+             fill_value=None, convert=True, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
-        if self._can_hold_na:
+
+        if not self._can_hold_na and allow_fill and fill_value is not None:
+            msg = 'Unable to fill values because {0} cannot contain NA'
+            raise ValueError(msg.format(self.__class__.__name__))
+        else:
             taken = self._assert_take_fillable(self.values, indices,
                                                allow_fill=allow_fill,
                                                fill_value=fill_value,
-                                               na_value=self._na_value)
-        else:
-            if allow_fill and fill_value is not None:
-                msg = 'Unable to fill values because {0} cannot contain NA'
-                raise ValueError(msg.format(self.__class__.__name__))
-            taken = self.values.take(indices)
+                                               na_value=self._na_value,
+                                               convert=convert)
         return self._shallow_copy(taken)

     def _assert_take_fillable(self, values, indices, allow_fill=True,
-                              fill_value=None, na_value=np.nan):
-        """ Internal method to handle NA filling of take """
-        indices = _ensure_platform_int(indices)
-
-        # only fill if we are passing a non-None fill_value
+                              fill_value=None, convert=True, na_value=np.nan):
+        """ internal method to handle NA filling of take """
+        indices = np.asarray(indices)
+
         if allow_fill and fill_value is not None:
             if (indices < -1).any():
                 msg = ('When allow_fill=True and fill_value is not None, '
                        'all indices must be >= -1')
                 raise ValueError(msg)
-            taken = values.take(indices)
-            mask = indices == -1
-            if mask.any():
-                taken[mask] = na_value
-        else:
-            taken = values.take(indices)
+            taken = algos.take_nd(values, indices, allow_fill=allow_fill,
+                                  fill_value=na_value)
+        else:
+            # provide wraparound semantics if fill_value not specified
+            if convert:
+                from pandas.core.indexing import maybe_convert_indices
+                n = values.shape[0]
+                indices = maybe_convert_indices(indices, n)
+            taken = algos.take_nd(values, indices, allow_fill=False)
         return taken

     @cache_readonly

Review comment (on `indices = np.asarray(indices)`) - Contributor: document the new arg.
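Seen from the outside, the two branches of _assert_take_fillable correspond to the two meanings of -1 in Index.take, which this hunk does not change. For example (output shown for a recent pandas; older versions print Float64Index):

```python
import numpy as np
import pandas as pd

idx = pd.Index([1.5, 2.5, 3.5])

# fill branch: -1 marks a missing slot, only when allow_fill=True and a fill_value is given
print(idx.take([0, -1], allow_fill=True, fill_value=np.nan))   # Index([1.5, nan], dtype='float64')

# wraparound branch: without a fill_value, -1 keeps its positional meaning
print(idx.take([0, -1]))                                       # Index([1.5, 3.5], dtype='float64')
```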
@@ -2529,7 +2530,7 @@ def _reindex_non_unique(self, target):
         if len(missing):
             l = np.arange(len(indexer))

-            missing = _ensure_platform_int(missing)
+            missing = missing
             missing_labels = target.take(missing)
             missing_indexer = _ensure_int64(l[~check])
             cur_labels = self.take(indexer[check])._values
@@ -2723,12 +2724,9 @@ def _join_non_unique(self, other, how='left', return_indexers=False):
                                                 [other._values], how=how,
                                                 sort=True)

-        left_idx = _ensure_platform_int(left_idx)
-        right_idx = _ensure_platform_int(right_idx)
-
-        join_index = self.values.take(left_idx)
-        mask = left_idx == -1
-        np.putmask(join_index, mask, other._values.take(right_idx))
+        lvals = algos.take_nd(self.values, left_idx, fill_value=-1)
+        rvals = algos.take_nd(other._values, right_idx, fill_value=-1)
+        join_index = np.where(left_idx == -1, rvals, lvals)

         join_index = self._wrap_joined_index(join_index, other)

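A standalone NumPy sketch of the combine step the new code performs (left_idx/right_idx are the joined positional indexers, with -1 meaning no match on that side):

```python
import numpy as np

left_vals = np.array(["a", "b", "c"], dtype=object)
right_vals = np.array(["x", "y"], dtype=object)

# positional indexers produced by the join; -1 means no match on that side
left_idx = np.array([0, 2, -1])
right_idx = np.array([1, -1, 0])

lvals = left_vals.take(left_idx)    # -1 wraps here, but those slots get overwritten below
rvals = right_vals.take(right_idx)
join_values = np.where(left_idx == -1, rvals, lvals)
print(join_values)                  # ['a' 'c' 'x']
```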
4 changes: 1 addition & 3 deletions pandas/indexes/category.py
@@ -5,7 +5,6 @@
 from pandas.compat.numpy import function as nv
 from pandas.types.generic import ABCCategorical, ABCSeries
 from pandas.types.common import (is_categorical_dtype,
-                                 _ensure_platform_int,
                                  is_list_like,
                                  is_scalar)
 from pandas.types.missing import array_equivalent
@@ -466,7 +465,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         codes = self.categories.get_indexer(target)
         indexer, _ = self._engine.get_indexer_non_unique(codes)

-        return _ensure_platform_int(indexer)
+        return indexer

     def get_indexer_non_unique(self, target):
         """ this is the same for a CategoricalIndex for get_indexer; the API
@@ -497,7 +496,6 @@ def _convert_list_indexer(self, keyarr, kind=None):
     def take(self, indices, axis=0, allow_fill=True,
              fill_value=None, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
         taken = self._assert_take_fillable(self.codes, indices,
                                            allow_fill=allow_fill,
                                            fill_value=fill_value,
23 changes: 10 additions & 13 deletions pandas/indexes/multi.py
@@ -1038,7 +1038,6 @@ def __getitem__(self, key):
     def take(self, indices, axis=0, allow_fill=True,
              fill_value=None, **kwargs):
         nv.validate_take(tuple(), kwargs)
-        indices = _ensure_platform_int(indices)
         taken = self._assert_take_fillable(self.labels, indices,
                                            allow_fill=allow_fill,
                                            fill_value=fill_value,
@@ -1055,17 +1054,16 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
                 msg = ('When allow_fill=True and fill_value is not None, '
                        'all indices must be >= -1')
                 raise ValueError(msg)
-            taken = [lab.take(indices) for lab in self.labels]
-            mask = indices == -1
-            if mask.any():
-                masked = []
-                for new_label in taken:
-                    label_values = new_label.values()
-                    label_values[mask] = na_value
-                    masked.append(base.FrozenNDArray(label_values))
-                taken = masked
+            taken = [algos.take_nd(lab, indices, fill_value=na_value)
+                     for lab in values]
         else:
-            taken = [lab.take(indices) for lab in self.labels]
+            # provide wraparound semantics
+            from pandas.core.indexing import maybe_convert_indices
+            taken = []
+            for i, lab in enumerate(values):
+                lab = maybe_convert_indices(lab, len(self.levels[i]))
+                taken.append(algos.take_nd(lab, indices, allow_fill=False))

         return taken

     def append(self, other):
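The same two meanings of -1 apply at the MultiIndex level, just per label array. A quick usage example with the public API (the filled entry shows up as NaN in every level):

```python
import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2), ("c", 3)])

print(mi.take([0, -1]))                                        # [('a', 1), ('c', 3)]
print(mi.take([0, -1], allow_fill=True, fill_value=np.nan))    # [('a', 1), (nan, nan)]
```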
@@ -1340,7 +1338,6 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
             if not ascending:
                 indexer = indexer[::-1]

-        indexer = _ensure_platform_int(indexer)
         new_labels = [lab.take(indexer) for lab in self.labels]

         new_index = MultiIndex(labels=new_labels, levels=self.levels,
@@ -1786,7 +1783,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
                 # selected
                 from pandas import Series
                 mapper = Series(indexer)
-                indexer = labels.take(_ensure_platform_int(indexer))
+                indexer = labels.take(indexer)
                 result = Series(Index(indexer).isin(r).nonzero()[0])
                 m = result.map(mapper)._values

7 changes: 3 additions & 4 deletions pandas/src/algos_common_helper.pxi
@@ -2847,17 +2847,16 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
 #----------------------------------------------------------------------
 # ensure_dtype
 #----------------------------------------------------------------------
-
-cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num

 cpdef ensure_platform_int(object arr):
     if util.is_array(arr):
         if (<ndarray> arr).descr.type_num == PLATFORM_INT:
             return arr
         else:
-            return arr.astype(np.int_)
+            return arr.astype(np.intp)
     else:
-        return np.array(arr, dtype=np.int_)
+        return np.array(arr, dtype=np.intp)

 cpdef ensure_object(object arr):
     if util.is_array(arr):
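Background on the np.int_ to np.intp switch: np.intp always matches the pointer size used for array indexing, while np.int_ is the C long, which is only 32 bits on 64-bit Windows under NumPy 1.x; that mismatch is what forced the scattered platform-int conversions this PR removes.

```python
import numpy as np

# on a 64-bit platform:
print(np.dtype(np.intp).itemsize * 8)   # 64 everywhere
print(np.dtype(np.int_).itemsize * 8)   # 32 on 64-bit Windows (NumPy 1.x), 64 on Linux/macOS
```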