Skip to content

Commit 0780443

Browse files
chris-b1jreback
authored and committed
PERF/COMPAT: define platform int to np.intp
AFAIK this only affects 64 bit python on Windows. `numpy` wants an `np.intp` (i8 on Windows) as an indexer for `take`, but pandas defines a "platform int" as a `np.int_` (i4 on Windows). This hits performance twice, because we often start with i8, cast to i4, then numpy will cast back to i8 in its `take`. Author: Chris <[email protected]> Closes #13972 from chris-b1/platform-int and squashes the following commits: Closes #3033 322b11a [Chris] lint fixup fc80938 [Chris] adjust for 32bit 84f38b2 [Chris] adjust test for platform independence 3ced5d5 [Chris] PERF/COMPAT: define platform int to np.intp
1 parent 5d791cc commit 0780443

22 files changed

+200
-146
lines changed

doc/source/whatsnew/v0.19.0.txt

+38
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,44 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
778778
- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`)
779779
- Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`)
780780

781+
.. _whatsnew_0190.indexer_dtype:
782+
783+
Indexer dtype Changes
784+
^^^^^^^^^^^^^^^^^^^^^
785+
786+
.. note::
787+
788+
This change only affects 64 bit python running on Windows, and only affects relatively advanced
789+
indexing operations.
790+
791+
Methods such as ``Index.get_indexer`` that return an indexer array coerce that array to a "platform int", so that it can be
792+
directly used in 3rd party library operations like ``numpy.take``. Previously, a platform int was defined as ``np.int_``
793+
which corresponds to a C ``long``, but the correct type, and what is being used now, is ``np.intp``, which corresponds
794+
to the C integer size that can hold a pointer. (:issue:`3033`, :issue:`13972`)
795+
796+
These types are the same on many platforms, but for 64 bit python on Windows,
797+
``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many
798+
operations on that platform.
799+
800+
Previous behaviour:
801+
802+
.. code-block:: ipython
803+
804+
In [1]: i = pd.Index(['a', 'b', 'c'])
805+
806+
In [2]: i.get_indexer(['b', 'b', 'c']).dtype
807+
Out[2]: dtype('int32')
808+
809+
New behaviour:
810+
811+
.. code-block:: ipython
812+
813+
In [1]: i = pd.Index(['a', 'b', 'c'])
814+
815+
In [2]: i.get_indexer(['b', 'b', 'c']).dtype
816+
Out[2]: dtype('int64')
817+
818+
781819
.. _whatsnew_0190.deprecations:
782820

783821
Deprecations

pandas/core/algorithms.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def sort_mixed(values):
259259
new_labels = reverse_indexer.take(labels, mode='wrap')
260260
np.putmask(new_labels, mask, na_sentinel)
261261

262-
return ordered, new_labels
262+
return ordered, _ensure_platform_int(new_labels)
263263

264264

265265
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):

pandas/hashtable.pyx

+6-6
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ cdef class Factorizer:
6464
mask = (labels == na_sentinel)
6565
# sort on
6666
if sort:
67-
if labels.dtype != np.int_:
68-
labels = labels.astype(np.int_)
67+
if labels.dtype != np.intp:
68+
labels = labels.astype(np.intp)
6969
sorter = self.uniques.to_array().argsort()
70-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
70+
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
7171
reverse_indexer.put(sorter, np.arange(len(sorter)))
7272
labels = reverse_indexer.take(labels, mode='clip')
7373
labels[mask] = na_sentinel
@@ -100,11 +100,11 @@ cdef class Int64Factorizer:
100100

101101
# sort on
102102
if sort:
103-
if labels.dtype != np.int_:
104-
labels = labels.astype(np.int_)
103+
if labels.dtype != np.intp:
104+
labels = labels.astype(np.intp)
105105

106106
sorter = self.uniques.to_array().argsort()
107-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
107+
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
108108
reverse_indexer.put(sorter, np.arange(len(sorter)))
109109

110110
labels = reverse_indexer.take(labels)

pandas/indexes/base.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -2820,7 +2820,7 @@ def _get_leaf_sorter(labels):
28202820
new_levels[level] = new_level
28212821

28222822
if keep_order: # just drop missing values. o.w. keep order
2823-
left_indexer = np.arange(len(left))
2823+
left_indexer = np.arange(len(left), dtype=np.intp)
28242824
mask = new_lev_labels != -1
28252825
if not mask.all():
28262826
new_labels = [lab[mask] for lab in new_labels]
@@ -2863,6 +2863,10 @@ def _get_leaf_sorter(labels):
28632863
left_indexer, right_indexer = right_indexer, left_indexer
28642864

28652865
if return_indexers:
2866+
left_indexer = (None if left_indexer is None
2867+
else _ensure_platform_int(left_indexer))
2868+
right_indexer = (None if right_indexer is None
2869+
else _ensure_platform_int(right_indexer))
28662870
return join_index, left_indexer, right_indexer
28672871
else:
28682872
return join_index
@@ -2906,6 +2910,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
29062910
join_index = self._wrap_joined_index(join_index, other)
29072911

29082912
if return_indexers:
2913+
lidx = None if lidx is None else _ensure_platform_int(lidx)
2914+
ridx = None if ridx is None else _ensure_platform_int(ridx)
29092915
return join_index, lidx, ridx
29102916
else:
29112917
return join_index

pandas/src/algos_common_helper.pxi

+5-3
Original file line numberDiff line numberDiff line change
@@ -2848,16 +2848,18 @@ def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
28482848
# ensure_dtype
28492849
#----------------------------------------------------------------------
28502850

2851-
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
2851+
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num
28522852

28532853
cpdef ensure_platform_int(object arr):
2854+
# GH3033, GH1392
2855+
# platform int is the size of the int pointer, e.g. np.intp
28542856
if util.is_array(arr):
28552857
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
28562858
return arr
28572859
else:
2858-
return arr.astype(np.int_)
2860+
return arr.astype(np.intp)
28592861
else:
2860-
return np.array(arr, dtype=np.int_)
2862+
return np.array(arr, dtype=np.intp)
28612863

28622864
cpdef ensure_object(object arr):
28632865
if util.is_array(arr):

pandas/src/algos_common_helper.pxi.in

+6-4
Original file line numberDiff line numberDiff line change
@@ -548,16 +548,18 @@ def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
548548
# ensure_dtype
549549
#----------------------------------------------------------------------
550550

551-
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
551+
cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.intp)).descr.type_num
552552

553553
cpdef ensure_platform_int(object arr):
554+
# GH3033, GH1392
555+
# platform int is the size of the int pointer, e.g. np.intp
554556
if util.is_array(arr):
555557
if (<ndarray> arr).descr.type_num == PLATFORM_INT:
556558
return arr
557559
else:
558-
return arr.astype(np.int_)
560+
return arr.astype(np.intp)
559561
else:
560-
return np.array(arr, dtype=np.int_)
562+
return np.array(arr, dtype=np.intp)
561563

562564
cpdef ensure_object(object arr):
563565
if util.is_array(arr):
@@ -600,4 +602,4 @@ cpdef ensure_{{name}}(object arr):
600602
else:
601603
return np.array(arr, dtype=np.{{dtype}})
602604

603-
{{endfor}}
605+
{{endfor}}

pandas/src/join.pyx

+6-10
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ float64 = np.dtype(np.float64)
3232
cdef double NaN = <double> np.NaN
3333
cdef double nan = NaN
3434

35-
from pandas.algos import groupsort_indexer
35+
from pandas.algos import groupsort_indexer, ensure_platform_int
36+
from pandas.core.algorithms import take_nd
3637

3738
include "joins_func_helper.pxi"
3839

@@ -148,16 +149,14 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
148149
# no multiple matches for any row on the left
149150
# this is a short-cut to avoid groupsort_indexer
150151
# otherwise, the `else` path also works in this case
151-
if left_sorter.dtype != np.int_:
152-
left_sorter = left_sorter.astype(np.int_)
152+
left_sorter = ensure_platform_int(left_sorter)
153153

154-
rev = np.empty(len(left), dtype=np.int_)
154+
rev = np.empty(len(left), dtype=np.intp)
155155
rev.put(left_sorter, np.arange(len(left)))
156156
else:
157157
rev, _ = groupsort_indexer(left_indexer, len(left))
158158

159-
if rev.dtype != np.int_:
160-
rev = rev.astype(np.int_)
159+
rev = ensure_platform_int(rev)
161160
right_indexer = right_indexer.take(rev)
162161
left_indexer = left_indexer.take(rev)
163162

@@ -228,11 +227,8 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
228227

229228

230229
def _get_result_indexer(sorter, indexer):
231-
if indexer.dtype != np.int_:
232-
indexer = indexer.astype(np.int_)
233230
if len(sorter) > 0:
234-
res = sorter.take(indexer)
235-
np.putmask(res, indexer == -1, -1)
231+
res = take_nd(sorter, indexer, fill_value=-1)
236232
else:
237233
# length-0 case
238234
res = np.empty(len(indexer), dtype=np.int64)

pandas/tests/frame/test_operators.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1204,7 +1204,7 @@ def test_alignment_non_pandas(self):
12041204

12051205
align = pd.core.ops._align_method_FRAME
12061206

1207-
for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.intp)]:
1207+
for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64)]:
12081208

12091209
tm.assert_series_equal(align(df, val, 'index'),
12101210
Series([1, 2, 3], index=df.index))

pandas/tests/indexes/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def f():
110110

111111
def test_reindex_base(self):
112112
idx = self.create_index()
113-
expected = np.arange(idx.size)
113+
expected = np.arange(idx.size, dtype=np.intp)
114114

115115
actual = idx.get_indexer(idx)
116116
tm.assert_numpy_array_equal(expected, actual)

pandas/tests/indexes/test_base.py

+19-12
Original file line numberDiff line numberDiff line change
@@ -975,10 +975,10 @@ def test_get_indexer(self):
975975
idx2 = Index([2, 4, 6])
976976

977977
r1 = idx1.get_indexer(idx2)
978-
assert_almost_equal(r1, np.array([1, 3, -1]))
978+
assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))
979979

980980
r1 = idx2.get_indexer(idx1, method='pad')
981-
e1 = np.array([-1, 0, 0, 1, 1])
981+
e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
982982
assert_almost_equal(r1, e1)
983983

984984
r2 = idx2.get_indexer(idx1[::-1], method='pad')
@@ -988,7 +988,7 @@ def test_get_indexer(self):
988988
assert_almost_equal(r1, rffill1)
989989

990990
r1 = idx2.get_indexer(idx1, method='backfill')
991-
e1 = np.array([0, 0, 1, 1, 2])
991+
e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
992992
assert_almost_equal(r1, e1)
993993

994994
rbfill1 = idx2.get_indexer(idx1, method='bfill')
@@ -1013,25 +1013,30 @@ def test_get_indexer_nearest(self):
10131013
all_methods = ['pad', 'backfill', 'nearest']
10141014
for method in all_methods:
10151015
actual = idx.get_indexer([0, 5, 9], method=method)
1016-
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9]))
1016+
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9],
1017+
dtype=np.intp))
10171018

10181019
actual = idx.get_indexer([0, 5, 9], method=method, tolerance=0)
1019-
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9]))
1020+
tm.assert_numpy_array_equal(actual, np.array([0, 5, 9],
1021+
dtype=np.intp))
10201022

10211023
for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9],
10221024
[0, 2, 9]]):
10231025
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method)
1024-
tm.assert_numpy_array_equal(actual, np.array(expected))
1026+
tm.assert_numpy_array_equal(actual, np.array(expected,
1027+
dtype=np.intp))
10251028

10261029
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method,
10271030
tolerance=1)
1028-
tm.assert_numpy_array_equal(actual, np.array(expected))
1031+
tm.assert_numpy_array_equal(actual, np.array(expected,
1032+
dtype=np.intp))
10291033

10301034
for method, expected in zip(all_methods, [[0, -1, -1], [-1, 2, -1],
10311035
[0, 2, -1]]):
10321036
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method,
10331037
tolerance=0.2)
1034-
tm.assert_numpy_array_equal(actual, np.array(expected))
1038+
tm.assert_numpy_array_equal(actual, np.array(expected,
1039+
dtype=np.intp))
10351040

10361041
with tm.assertRaisesRegexp(ValueError, 'limit argument'):
10371042
idx.get_indexer([1, 0], method='nearest', limit=1)
@@ -1042,22 +1047,24 @@ def test_get_indexer_nearest_decreasing(self):
10421047
all_methods = ['pad', 'backfill', 'nearest']
10431048
for method in all_methods:
10441049
actual = idx.get_indexer([0, 5, 9], method=method)
1045-
tm.assert_numpy_array_equal(actual, np.array([9, 4, 0]))
1050+
tm.assert_numpy_array_equal(actual, np.array([9, 4, 0],
1051+
dtype=np.intp))
10461052

10471053
for method, expected in zip(all_methods, [[8, 7, 0], [9, 8, 1],
10481054
[9, 7, 0]]):
10491055
actual = idx.get_indexer([0.2, 1.8, 8.5], method=method)
1050-
tm.assert_numpy_array_equal(actual, np.array(expected))
1056+
tm.assert_numpy_array_equal(actual, np.array(expected,
1057+
dtype=np.intp))
10511058

10521059
def test_get_indexer_strings(self):
10531060
idx = pd.Index(['b', 'c'])
10541061

10551062
actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='pad')
1056-
expected = np.array([-1, 0, 1, 1])
1063+
expected = np.array([-1, 0, 1, 1], dtype=np.intp)
10571064
tm.assert_numpy_array_equal(actual, expected)
10581065

10591066
actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='backfill')
1060-
expected = np.array([0, 0, 1, -1])
1067+
expected = np.array([0, 0, 1, -1], dtype=np.intp)
10611068
tm.assert_numpy_array_equal(actual, expected)
10621069

10631070
with tm.assertRaises(TypeError):

pandas/tests/indexes/test_category.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ def test_reindex_base(self):
336336

337337
# determined by cat ordering
338338
idx = self.create_index()
339-
expected = np.array([4, 0, 1, 5, 2, 3])
339+
expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp)
340340

341341
actual = idx.get_indexer(idx)
342342
tm.assert_numpy_array_equal(expected, actual)
@@ -403,7 +403,7 @@ def test_get_indexer(self):
403403

404404
for indexer in [idx2, list('abf'), Index(list('abf'))]:
405405
r1 = idx1.get_indexer(idx2)
406-
assert_almost_equal(r1, np.array([0, 1, 2, -1]))
406+
assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))
407407

408408
self.assertRaises(NotImplementedError,
409409
lambda: idx2.get_indexer(idx1, method='pad'))

0 commit comments

Comments
 (0)