Skip to content

CLN/STY: pandas/_libs/internals.pyx #32801

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Mar 26, 2020
Merged
54 changes: 30 additions & 24 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ cdef class BlockPlacement:
cdef:
slice _as_slice
object _as_array

bint _has_slice, _has_array, _is_known_slice_like

def __init__(self, val):
Expand Down Expand Up @@ -52,19 +51,21 @@ cdef class BlockPlacement:
def __str__(self) -> str:
cdef:
slice s = self._ensure_has_slice()

if s is not None:
v = self._as_slice
else:
v = self._as_array

return f'{type(self).__name__}({v})'
return f"{type(self).__name__}({v})"

def __repr__(self) -> str:
return str(self)

def __len__(self) -> int:
cdef:
slice s = self._ensure_has_slice()

if s is not None:
return slice_len(s)
else:
Expand All @@ -74,55 +75,63 @@ cdef class BlockPlacement:
cdef:
slice s = self._ensure_has_slice()
Py_ssize_t start, stop, step, _

if s is not None:
start, stop, step, _ = slice_get_indices_ex(s)
return iter(range(start, stop, step))
else:
return iter(self._as_array)

return iter(self._as_array)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche This is a very similar change to the change that was mentioned here, would you like me to revert that as well?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here (see comment below)


@property
def as_slice(self) -> slice:
cdef:
slice s = self._ensure_has_slice()
if s is None:
raise TypeError('Not slice-like')
else:

if s is not None:
return s

raise TypeError("Not slice-like")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also this change I personally don't find an improvement in code flow


@property
def indexer(self):
cdef:
slice s = self._ensure_has_slice()

if s is not None:
return s
else:
return self._as_array
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche This is a very similar change to the change that was mentioned here, would you like me to revert that as well?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, my opinion is only a single one, but yes, I would personally revert this as well


return self._as_array

def isin(self, arr):
from pandas.core.indexes.api import Int64Index

return Int64Index(self.as_array, copy=False).isin(arr)

@property
def as_array(self):
cdef:
Py_ssize_t start, stop, end, _

if not self._has_array:
start, stop, step, _ = slice_get_indices_ex(self._as_slice)
# NOTE: this is the C-optimized equivalent of
# np.arange(start, stop, step, dtype=np.int64)
# `np.arange(start, stop, step, dtype=np.int64)`
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64)
self._has_array = True

return self._as_array

@property
def is_slice_like(self) -> bool:
cdef:
slice s = self._ensure_has_slice()

return s is not None

def __getitem__(self, loc):
cdef:
slice s = self._ensure_has_slice()

if s is not None:
val = slice_getitem(s, loc)
else:
Expand All @@ -137,11 +146,12 @@ cdef class BlockPlacement:
return BlockPlacement(np.delete(self.as_array, loc, axis=0))

def append(self, others):
if len(others) == 0:
if not len(others):
return self

return BlockPlacement(np.concatenate([self.as_array] +
[o.as_array for o in others]))
return BlockPlacement(
np.concatenate([self.as_array] + [o.as_array for o in others])
)

cdef iadd(self, other):
cdef:
Expand All @@ -159,8 +169,7 @@ cdef class BlockPlacement:
start += other_int
stop += other_int

if ((step > 0 and start < 0) or
(step < 0 and stop < step)):
if (step > 0 and start < 0) or (step < 0 and stop < step):
raise ValueError("iadd causes length change")

if stop < 0:
Expand All @@ -187,6 +196,7 @@ cdef class BlockPlacement:
if not self._has_slice:
self._as_slice = indexer_as_slice(self._as_array)
self._has_slice = True

return self._as_slice


Expand Down Expand Up @@ -236,8 +246,7 @@ cdef slice slice_canonize(slice s):
return slice(start, stop, step)


cpdef Py_ssize_t slice_len(
slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
"""
Get length of a bounded slice.

Expand All @@ -254,8 +263,7 @@ cpdef Py_ssize_t slice_len(
if slc is None:
raise TypeError("slc must be slice")

PySlice_GetIndicesEx(slc, objlen,
&start, &stop, &step, &length)
PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)

return length

Expand All @@ -273,8 +281,7 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX):
if slc is None:
raise TypeError("slc should be a slice")

PySlice_GetIndicesEx(slc, objlen,
&start, &stop, &step, &length)
PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)

return start, stop, step, length

Expand Down Expand Up @@ -374,8 +381,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
# blockno handling.
cdef:
int64_t cur_blkno
Py_ssize_t i, start, stop, n, diff

Py_ssize_t i, start, stop, n, diff, tot_len
object blkno
object group_dict = defaultdict(list)
int64_t[:] res_view
Expand All @@ -388,7 +394,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
start = 0
cur_blkno = blknos[start]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doing this here instead of above means we can avoid the assignment in the n==0 case


if group is False:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if group is False should be marginally faster than if not group

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my benchmarking it's actually the other way around:

In [1]: foo = True                                                                                            

In [2]: %timeit foo is False                                                                                  
22.9 ns ± 0.157 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [3]: %timeit foo is False                                                                                  
23 ns ± 0.0388 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [4]: %timeit foo is False                                                                                  
22.7 ns ± 0.194 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [5]: %timeit not foo                                                                                       
20.7 ns ± 0.0589 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [6]: %timeit not foo                                                                                       
21.1 ns ± 0.0636 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [7]: %timeit not foo                                                                                       
20.6 ns ± 0.0497 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [1]: bar = False                                                                                           

In [2]: %timeit bar is False                                                                                  
28.5 ns ± 0.077 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [3]: %timeit bar is False                                                                                  
28.6 ns ± 0.0666 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [4]: %timeit bar is False                                                                                  
29.5 ns ± 0.203 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [5]: %timeit not bar                                                                                       
26.1 ns ± 0.217 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [6]: %timeit not bar                                                                                       
25.8 ns ± 0.0231 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [7]: %timeit not bar                                                                                       
26.7 ns ± 0.0831 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

look at the generated C code. The "is" should be a pointer comparison, whereas the other one should have to do some more work.

if not group:
for i in range(1, n):
if blknos[i] != cur_blkno:
yield cur_blkno, slice(start, i)
Expand Down