Skip to content

Commit 3ced5d5

Browse files
committed
PERF/COMPAT: define platform int to np.intp
1 parent 4a80521 commit 3ced5d5

File tree

5 files changed

+49
-19
lines changed

5 files changed

+49
-19
lines changed

doc/source/whatsnew/v0.19.0.txt

+34
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,40 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
767767
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
768768
- Bug in single row slicing on multi-type ``SparseDataFrame`` objects, where types were previously forced to float (:issue:`13917`)
769769

770+
Indexer dtype Changes
771+
^^^^^^^^^^^^^^^^^^^^^
772+
773+
.. note::
774+
775+
This change only affects 64 bit python running on Windows, and only affects relatively advanced
776+
indexing operations.
777+
778+
Methods such as ``Index.get_indexer`` that return an indexer array coerce that array to a "platform int", so that it can be
779+
directly used in 3rd party library operations like ``numpy.take``. Previously, a platform int was defined as ``np.int_``
780+
which corresponds to a C ``long`` - but the correct type, and what is being used now, is ``np.intp``, which corresponds
781+
to the C integer size that can hold a pointer.
782+
783+
These types are the same on many platforms, but for 64 bit python on Windows,
784+
``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many
785+
operations on that platform.
786+
787+
Previous behaviour:
788+
789+
.. code-block:: ipython
790+
791+
In [1]: i = pd.Index(['a', 'b', 'c'])
792+
793+
In [2]: i.get_indexer(['b', 'b', 'c']).dtype
794+
Out[2]: dtype('int32')
795+
796+
New behaviour:
797+
798+
.. ipython :: python
799+
800+
i = pd.Index(['a', 'b', 'c'])
801+
i.get_indexer(['b', 'b', 'c']).dtype
802+
803+
770804
.. _whatsnew_0190.deprecations:
771805

772806
Deprecations

pandas/src/algos_common_helper.pxi

+3-3
Original file line numberDiff line numberDiff line change
@@ -2848,16 +2848,16 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
28482848
# ensure_dtype
28492849
#----------------------------------------------------------------------
28502850

2851-
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
2851+
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num
28522852

28532853
cpdef ensure_platform_int(object arr):
28542854
if util.is_array(arr):
28552855
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
28562856
return arr
28572857
else:
2858-
return arr.astype(np.int_)
2858+
return arr.astype(np.intp)
28592859
else:
2860-
return np.array(arr, dtype=np.int_)
2860+
return np.array(arr, dtype=np.intp)
28612861

28622862
cpdef ensure_object(object arr):
28632863
if util.is_array(arr):

pandas/src/algos_common_helper.pxi.in

+4-4
Original file line numberDiff line numberDiff line change
@@ -548,16 +548,16 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
548548
# ensure_dtype
549549
#----------------------------------------------------------------------
550550

551-
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
551+
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num
552552

553553
cpdef ensure_platform_int(object arr):
554554
if util.is_array(arr):
555555
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
556556
return arr
557557
else:
558-
return arr.astype(np.int_)
558+
return arr.astype(np.intp)
559559
else:
560-
return np.array(arr, dtype=np.int_)
560+
return np.array(arr, dtype=np.intp)
561561

562562
cpdef ensure_object(object arr):
563563
if util.is_array(arr):
@@ -600,4 +600,4 @@ cpdef ensure_{{name}}(object arr):
600600
else:
601601
return np.array(arr, dtype=np.{{dtype}})
602602

603-
{{endfor}}
603+
{{endfor}}

pandas/src/join.pyx

+6-10
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ float64 = np.dtype(np.float64)
3232
cdef double NaN = <double> np.NaN
3333
cdef double nan = NaN
3434

35-
from pandas.algos import groupsort_indexer
35+
from pandas.algos import groupsort_indexer, ensure_platform_int
36+
from pandas.core.algorithms import take_nd
3637

3738
include "joins_func_helper.pxi"
3839

@@ -148,16 +149,14 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
148149
# no multiple matches for any row on the left
149150
# this is a short-cut to avoid groupsort_indexer
150151
# otherwise, the `else` path also works in this case
151-
if left_sorter.dtype != np.int_:
152-
left_sorter = left_sorter.astype(np.int_)
152+
left_sorter = ensure_platform_int(left_sorter)
153153

154-
rev = np.empty(len(left), dtype=np.int_)
154+
rev = np.empty(len(left), dtype=np.intp)
155155
rev.put(left_sorter, np.arange(len(left)))
156156
else:
157157
rev, _ = groupsort_indexer(left_indexer, len(left))
158158

159-
if rev.dtype != np.int_:
160-
rev = rev.astype(np.int_)
159+
rev = ensure_platform_int(rev)
161160
right_indexer = right_indexer.take(rev)
162161
left_indexer = left_indexer.take(rev)
163162

@@ -228,11 +227,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
228227

229228

230229
def _get_result_indexer(sorter, indexer):
231-
if indexer.dtype != np.int_:
232-
indexer = indexer.astype(np.int_)
233230
if len(sorter) > 0:
234-
res = sorter.take(indexer)
235-
np.putmask(res, indexer == -1, -1)
231+
res = take_nd(sorter, indexer, fill_value=-1)
236232
else:
237233
# length-0 case
238234
res = np.empty(len(indexer), dtype=np.int64)

pandas/tools/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
572572
if name in self.left:
573573

574574
if left_has_missing is None:
575-
left_has_missing = any(left_indexer == -1)
575+
left_has_missing = (left_indexer == -1).any()
576576

577577
if left_has_missing:
578578
take_right = self.right_join_keys[i]
@@ -584,7 +584,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
584584
elif name in self.right:
585585

586586
if right_has_missing is None:
587-
right_has_missing = any(right_indexer == -1)
587+
right_has_missing = (right_indexer == -1).any()
588588

589589
if right_has_missing:
590590
take_left = self.left_join_keys[i]

0 commit comments

Comments
 (0)