Skip to content

PERF/COMPAT: define platform int to np.intp #13972

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,42 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`)

Indexer dtype Changes
^^^^^^^^^^^^^^^^^^^^^
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue ref here.


.. note::

This change only affects 64-bit Python running on Windows, and only affects relatively advanced
indexing operations.

Methods such as ``Index.get_indexer`` that return an indexer array coerce that array to a "platform int", so that it can be
directly used in third-party library operations like ``numpy.take``. Previously, a platform int was defined as ``np.int_``,
which corresponds to a C ``long``, but the correct type, and what is being used now, is ``np.intp``, which corresponds
to the C integer size that can hold a pointer. (:issue:`13972`)

These types are the same on many platforms, but for 64-bit Python on Windows,
``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many
operations on that platform.

Previous behaviour:

.. code-block:: ipython

In [1]: i = pd.Index(['a', 'b', 'c'])

In [2]: i.get_indexer(['b', 'b', 'c']).dtype
Out[2]: dtype('int32')

New behaviour:

.. code-block:: ipython

In [1]: i = pd.Index(['a', 'b', 'c'])

In [2]: i.get_indexer(['b', 'b', 'c']).dtype
Out[2]: dtype('int64')


.. _whatsnew_0190.deprecations:

Deprecations
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def sort_mixed(values):
new_labels = reverse_indexer.take(labels, mode='wrap')
np.putmask(new_labels, mask, na_sentinel)

return ordered, new_labels
return ordered, _ensure_platform_int(new_labels)


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
Expand Down
12 changes: 6 additions & 6 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ cdef class Factorizer:
mask = (labels == na_sentinel)
# sort on
if sort:
if labels.dtype != np.int_:
labels = labels.astype(np.int_)
if labels.dtype != np.intp:
labels = labels.astype(np.intp)
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels, mode='clip')
labels[mask] = na_sentinel
Expand Down Expand Up @@ -100,11 +100,11 @@ cdef class Int64Factorizer:

# sort on
if sort:
if labels.dtype != np.int_:
labels = labels.astype(np.int_)
if labels.dtype != np.intp:
labels = labels.astype(np.intp)

sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))

labels = reverse_indexer.take(labels)
Expand Down
8 changes: 7 additions & 1 deletion pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2816,7 +2816,7 @@ def _get_leaf_sorter(labels):
new_levels[level] = new_level

if keep_order: # just drop missing values. o.w. keep order
left_indexer = np.arange(len(left))
left_indexer = np.arange(len(left), dtype=np.intp)
mask = new_lev_labels != -1
if not mask.all():
new_labels = [lab[mask] for lab in new_labels]
Expand Down Expand Up @@ -2859,6 +2859,10 @@ def _get_leaf_sorter(labels):
left_indexer, right_indexer = right_indexer, left_indexer

if return_indexers:
left_indexer = (None if left_indexer is None
else _ensure_platform_int(left_indexer))
right_indexer = (None if right_indexer is None
else _ensure_platform_int(right_indexer))
return join_index, left_indexer, right_indexer
else:
return join_index
Expand Down Expand Up @@ -2902,6 +2906,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
join_index = self._wrap_joined_index(join_index, other)

if return_indexers:
lidx = None if lidx is None else _ensure_platform_int(lidx)
ridx = None if ridx is None else _ensure_platform_int(ridx)
return join_index, lidx, ridx
else:
return join_index
Expand Down
6 changes: 3 additions & 3 deletions pandas/src/algos_common_helper.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2848,16 +2848,16 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
# ensure_dtype
#----------------------------------------------------------------------

cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num

cpdef ensure_platform_int(object arr):
if util.is_array(arr):
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
return arr
else:
return arr.astype(np.int_)
return arr.astype(np.intp)
else:
return np.array(arr, dtype=np.int_)
return np.array(arr, dtype=np.intp)

cpdef ensure_object(object arr):
if util.is_array(arr):
Expand Down
8 changes: 4 additions & 4 deletions pandas/src/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -548,16 +548,16 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
# ensure_dtype
#----------------------------------------------------------------------

cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num

cpdef ensure_platform_int(object arr):
if util.is_array(arr):
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
return arr
else:
return arr.astype(np.int_)
return arr.astype(np.intp)
else:
return np.array(arr, dtype=np.int_)
return np.array(arr, dtype=np.intp)

cpdef ensure_object(object arr):
if util.is_array(arr):
Expand Down Expand Up @@ -600,4 +600,4 @@ cpdef ensure_{{name}}(object arr):
else:
return np.array(arr, dtype=np.{{dtype}})

{{endfor}}
{{endfor}}
16 changes: 6 additions & 10 deletions pandas/src/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ float64 = np.dtype(np.float64)
cdef double NaN = <double> np.NaN
cdef double nan = NaN

from pandas.algos import groupsort_indexer
from pandas.algos import groupsort_indexer, ensure_platform_int
from pandas.core.algorithms import take_nd

include "joins_func_helper.pxi"

Expand Down Expand Up @@ -148,16 +149,14 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
# no multiple matches for any row on the left
# this is a short-cut to avoid groupsort_indexer
# otherwise, the `else` path also works in this case
if left_sorter.dtype != np.int_:
left_sorter = left_sorter.astype(np.int_)
left_sorter = ensure_platform_int(left_sorter)

rev = np.empty(len(left), dtype=np.int_)
rev = np.empty(len(left), dtype=np.intp)
rev.put(left_sorter, np.arange(len(left)))
else:
rev, _ = groupsort_indexer(left_indexer, len(left))

if rev.dtype != np.int_:
rev = rev.astype(np.int_)
rev = ensure_platform_int(rev)
right_indexer = right_indexer.take(rev)
left_indexer = left_indexer.take(rev)

Expand Down Expand Up @@ -228,11 +227,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,


def _get_result_indexer(sorter, indexer):
if indexer.dtype != np.int_:
indexer = indexer.astype(np.int_)
if len(sorter) > 0:
res = sorter.take(indexer)
np.putmask(res, indexer == -1, -1)
res = take_nd(sorter, indexer, fill_value=-1)
else:
# length-0 case
res = np.empty(len(indexer), dtype=np.int64)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,7 +1196,7 @@ def test_alignment_non_pandas(self):

align = pd.core.ops._align_method_FRAME

for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.intp)]:
for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64)]:

tm.assert_series_equal(align(df, val, 'index'),
Series([1, 2, 3], index=df.index))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def f():

def test_reindex_base(self):
idx = self.create_index()
expected = np.arange(idx.size)
expected = np.arange(idx.size, dtype=np.intp)

actual = idx.get_indexer(idx)
tm.assert_numpy_array_equal(expected, actual)
Expand Down
31 changes: 19 additions & 12 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,10 +936,10 @@ def test_get_indexer(self):
idx2 = Index([2, 4, 6])

r1 = idx1.get_indexer(idx2)
assert_almost_equal(r1, np.array([1, 3, -1]))
assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))

r1 = idx2.get_indexer(idx1, method='pad')
e1 = np.array([-1, 0, 0, 1, 1])
e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
assert_almost_equal(r1, e1)

r2 = idx2.get_indexer(idx1[::-1], method='pad')
Expand All @@ -949,7 +949,7 @@ def test_get_indexer(self):
assert_almost_equal(r1, rffill1)

r1 = idx2.get_indexer(idx1, method='backfill')
e1 = np.array([0, 0, 1, 1, 2])
e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
assert_almost_equal(r1, e1)

rbfill1 = idx2.get_indexer(idx1, method='bfill')
Expand All @@ -974,25 +974,30 @@ def test_get_indexer_nearest(self):
all_methods = ['pad', 'backfill', 'nearest']
for method in all_methods:
actual = idx.get_indexer([0, 5, 9], method=method)
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9]))
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9],
dtype=np.intp))

actual = idx.get_indexer([0, 5, 9], method=method, tolerance=0)
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9]))
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9],
dtype=np.intp))

for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9],
[0, 2, 9]]):
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method)
tm.assert_numpy_array_equal(actual, np.array(expected))
tm.assert_numpy_array_equal(actual, np.array(expected,
dtype=np.intp))

actual = idx.get_indexer([0.2, 1.8, 8.5], method=method,
tolerance=1)
tm.assert_numpy_array_equal(actual, np.array(expected))
tm.assert_numpy_array_equal(actual, np.array(expected,
dtype=np.intp))

for method, expected in zip(all_methods, [[0, -1, -1], [-1, 2, -1],
[0, 2, -1]]):
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method,
tolerance=0.2)
tm.assert_numpy_array_equal(actual, np.array(expected))
tm.assert_numpy_array_equal(actual, np.array(expected,
dtype=np.intp))

with tm.assertRaisesRegexp(ValueError, 'limit argument'):
idx.get_indexer([1, 0], method='nearest', limit=1)
Expand All @@ -1003,22 +1008,24 @@ def test_get_indexer_nearest_decreasing(self):
all_methods = ['pad', 'backfill', 'nearest']
for method in all_methods:
actual = idx.get_indexer([0, 5, 9], method=method)
tm.assert_numpy_array_equal(actual, np.array([9, 4, 0]))
tm.assert_numpy_array_equal(actual, np.array([9, 4, 0],
dtype=np.intp))

for method, expected in zip(all_methods, [[8, 7, 0], [9, 8, 1],
[9, 7, 0]]):
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method)
tm.assert_numpy_array_equal(actual, np.array(expected))
tm.assert_numpy_array_equal(actual, np.array(expected,
dtype=np.intp))

def test_get_indexer_strings(self):
idx = pd.Index(['b', 'c'])

actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='pad')
expected = np.array([-1, 0, 1, 1])
expected = np.array([-1, 0, 1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(actual, expected)

actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='backfill')
expected = np.array([0, 0, 1, -1])
expected = np.array([0, 0, 1, -1], dtype=np.intp)
tm.assert_numpy_array_equal(actual, expected)

with tm.assertRaises(TypeError):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def test_reindex_base(self):

# determined by cat ordering
idx = self.create_index()
expected = np.array([4, 0, 1, 5, 2, 3])
expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp)

actual = idx.get_indexer(idx)
tm.assert_numpy_array_equal(expected, actual)
Expand Down Expand Up @@ -403,7 +403,7 @@ def test_get_indexer(self):

for indexer in [idx2, list('abf'), Index(list('abf'))]:
r1 = idx1.get_indexer(idx2)
assert_almost_equal(r1, np.array([0, 1, 2, -1]))
assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))

self.assertRaises(NotImplementedError,
lambda: idx2.get_indexer(idx1, method='pad'))
Expand Down
Loading