Skip to content

REF: avoid getattr pattern for rank_1d, rank_2d #29137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 5 additions & 17 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
ranked_mat = np.empty((N, K), dtype=np.float64)

for i in range(K):
ranked_mat[:, i] = rank_1d_float64(mat[:, i])
ranked_mat[:, i] = rank_1d(mat[:, i])

for xi in range(K):
for yi in range(xi + 1):
Expand All @@ -337,8 +337,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
j += 1

if not all_ranks:
maskedx = rank_1d_float64(maskedx)
maskedy = rank_1d_float64(maskedy)
maskedx = rank_1d(maskedx)
maskedy = rank_1d(maskedy)

mean = (nobs + 1) / 2.

Expand Down Expand Up @@ -1005,12 +1005,6 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
return ranks


rank_1d_object = rank_1d["object"]
rank_1d_float64 = rank_1d["float64_t"]
rank_1d_uint64 = rank_1d["uint64_t"]
rank_1d_int64 = rank_1d["int64_t"]


def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
ascending=True, na_option='keep', pct=False):
"""
Expand Down Expand Up @@ -1083,8 +1077,8 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
except TypeError:
values = in_arr
for i in range(len(values)):
ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method,
ascending=ascending, pct=pct)
ranks[i] = rank_1d(in_arr[i], ties_method=ties_method,
ascending=ascending, pct=pct)
if axis == 0:
return ranks.T
else:
Expand Down Expand Up @@ -1179,12 +1173,6 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
return ranks


rank_2d_object = rank_2d["object"]
rank_2d_float64 = rank_2d["float64_t"]
rank_2d_uint64 = rank_2d["uint64_t"]
rank_2d_int64 = rank_2d["int64_t"]


# generated from template
include "algos_common_helper.pxi"
include "algos_take_helper.pxi"
27 changes: 16 additions & 11 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ def _group_add(complexfloating_t[:, :] out,
if len(values) != len(labels):
raise ValueError("len(index) != len(labels)")

nobs = np.zeros((len(out), out.shape[1]), dtype=np.int64)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the object cast buy us here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://groups.google.com/forum/#!topic/cython-users/xheRGGOiuYw

For reasons unknown, out.shape is evaluated as an 8-tuple instead of a 2-tuple, so this call raises at runtime without the <object> cast. That is why the status quo doesn't use out.shape directly on L458.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty interesting... and the cast to object here doesn't trigger any underlying function calls? I'm not sure whether it just changes the type of pointer Cython uses or whether it actually changes the memoryview.

sumx = np.zeros_like(out)

N, K = (<object>values).shape
Expand Down Expand Up @@ -507,12 +507,13 @@ def _group_prod(floating[:, :] out,
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, count
floating[:, :] prodx, nobs
floating[:, :] prodx
int64_t[:, :] nobs

if not len(values) == len(labels):
raise ValueError("len(index) != len(labels)")

nobs = np.zeros_like(out)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
prodx = np.ones_like(out)

N, K = (<object>values).shape
Expand Down Expand Up @@ -555,14 +556,15 @@ def _group_var(floating[:, :] out,
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
floating[:, :] nobs, mean
floating[:, :] mean
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise ValueError("len(index) != len(labels)")

nobs = np.zeros_like(out)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
mean = np.zeros_like(out)

N, K = (<object>values).shape
Expand Down Expand Up @@ -610,14 +612,15 @@ def _group_mean(floating[:, :] out,
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, count
floating[:, :] sumx, nobs
floating[:, :] sumx
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise ValueError("len(index) != len(labels)")

nobs = np.zeros_like(out)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
sumx = np.zeros_like(out)

N, K = (<object>values).shape
Expand Down Expand Up @@ -1243,15 +1246,16 @@ def group_max(groupby_t[:, :] out,
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] maxx, nobs
ndarray[groupby_t, ndim=2] maxx
bint runtime_error = False
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
nobs = np.zeros((<object>out).shape, dtype=np.int64)

maxx = np.empty_like(out)
if groupby_t is int64_t:
Expand Down Expand Up @@ -1314,15 +1318,16 @@ def group_min(groupby_t[:, :] out,
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] minx, nobs
ndarray[groupby_t, ndim=2] minx
bint runtime_error = False
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
nobs = np.zeros((<object>out).shape, dtype=np.int64)

minx = np.empty_like(out)
if groupby_t is int64_t:
Expand Down
31 changes: 11 additions & 20 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,17 @@ def _get_hashtable_algo(values):
return (htable, table, values, dtype, ndtype)


def _get_data_algo(values, func_map):
def _get_values_for_rank(values):
if is_categorical_dtype(values):
values = values._values_for_rank()

values, dtype, ndtype = _ensure_data(values)
return values, dtype, ndtype


def _get_data_algo(values, func_map):
values, dtype, ndtype = _get_values_for_rank(values)

if ndtype == "object":

# it's cheaper to use a String Hash Table than Object; we infer
Expand Down Expand Up @@ -900,17 +906,17 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
"""
if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
ranks = f(
values, _, _ = _get_values_for_rank(values)
ranks = algos.rank_1d(
values,
ties_method=method,
ascending=ascending,
na_option=na_option,
pct=pct,
)
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(
values, _, _ = _get_values_for_rank(values)
ranks = algos.rank_2d(
values,
axis=axis,
ties_method=method,
Expand Down Expand Up @@ -1000,21 +1006,6 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
return arr + b


_rank1d_functions = {
"float64": algos.rank_1d_float64,
"int64": algos.rank_1d_int64,
"uint64": algos.rank_1d_uint64,
"object": algos.rank_1d_object,
}

_rank2d_functions = {
"float64": algos.rank_2d_float64,
"int64": algos.rank_2d_int64,
"uint64": algos.rank_2d_uint64,
"object": algos.rank_2d_object,
}


def quantile(x, q, interpolation_method="fraction"):
"""
Compute sample quantile or quantiles of the input array. For example, q=0.5
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1619,7 +1619,7 @@ def test_scipy_compat(self):
def _check(arr):
mask = ~np.isfinite(arr)
arr = arr.copy()
result = libalgos.rank_1d_float64(arr)
result = libalgos.rank_1d(arr)
arr[mask] = np.inf
exp = rankdata(arr)
exp[mask] = np.nan
Expand Down