Skip to content

ERR: Consistent errors for non-numeric ranking. (#19560) #20670

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,8 @@ Other Enhancements
Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Using :func:`DataFrame.rank` on a data frame with non-numeric entries other than ordered categoricals will raise a ValueError.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to the list of issues. use double-backticks around ValueError.

'data frame' -> DataFrame


.. _whatsnew_0230.api_breaking.deps:

Dependencies have increased minimum versions
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,17 @@ def rank(values, axis=0, method='average', na_option='keep',
Whether or not to the display the returned rankings in integer form
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
"""
if is_object_dtype(values):
def raise_non_numeric_error():
raise ValueError("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")
if is_categorical_dtype(values):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be simpler, maybe

if is_object_dtype(values) and not (is_categorical_dtype(values) and values.ordered):
    raise ...

in the error message
just say ".rank().format(type(value).__name__)"

if not values.ordered:
raise_non_numeric_error()
else:
raise_non_numeric_error()

if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
ranks = f(values, ties_method=method, ascending=ascending,
Expand Down
48 changes: 21 additions & 27 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,22 @@ def test_rank2(self):
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)

df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this a separate test and move all of the cases. Ideally you could parameterize them

expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(1, numeric_only=False)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(0, numeric_only=False)

df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)

expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(1, numeric_only=False)
with tm.assert_raises_regex(ValueError, error_msg):
df.rank(0, numeric_only=False)

# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
Expand All @@ -110,9 +109,9 @@ def test_rank2(self):
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

result = self.mixed_frame.rank(1)
expected = self.mixed_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
# mixed_frame["foo"] is of string-type
with tm.assert_raises_regex(ValueError, error_msg):
self.mixed_frame.rank(1)

df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
1e60, 1e80, 1e-30]})
Expand Down Expand Up @@ -218,7 +217,7 @@ def test_rank_methods_frame(self):
tm.assert_frame_equal(result, expected)

def test_rank_descending(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test with assert that the object ones raise (unless it duplicates too much the above tests)

dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']

for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
Expand All @@ -230,15 +229,11 @@ def test_rank_descending(self):
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)

if method == 'first' and dtype == 'O':
continue

expected = (df.max() - df).rank(method=method)

if dtype != 'O':
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)

res3 = df.rank(method=method, ascending=False,
numeric_only=False)
Expand All @@ -258,11 +253,10 @@ def _check2d(df, expected, method='average', axis=0):
assert_frame_equal(result, exp_df)

dtypes = [None, object]
disabled = set([(object, 'first')])
results = self.results

for method, axis, dtype in product(results, [0, 1], dtypes):
if (dtype, method) in disabled:
if dtype == object:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use is_object_dtype

continue
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, results[method], method=method, axis=axis)
Expand Down
60 changes: 31 additions & 29 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,22 +134,27 @@ def test_rank_categorical(self):
assert_series_equal(ordered.rank(), exp)
assert_series_equal(ordered.rank(ascending=False), exp_desc)

# Unordered categoricals should be ranked as objects
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

# Ranking unordered categorials depreciated per #19560
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to say "not supported" instead of "deprecated"

unordered = Series(['first', 'second', 'third', 'fourth',
'fifth', 'sixth']).astype(
CategoricalDtype(categories=['first', 'second', 'third',
'fourth', 'fifth', 'sixth'],
ordered=False))
exp_unordered = Series([2., 4., 6., 3., 1., 5.])
res = unordered.rank()
assert_series_equal(res, exp_unordered)

with tm.assert_raises_regex(ValueError, error_msg):
unordered.rank()

unordered1 = Series(
[1, 2, 3, 4, 5, 6],
).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False))
exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
res1 = unordered1.rank()
assert_series_equal(res1, exp_unordered1)

# Won't raise ValueError because entries not objects.
unordered1.rank()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this right? Certainly the example passes the "eye test" but if the Categorical is not ordered what semantics are we using for ranking?


# Test na_option for rank data
na_ser = Series(
Expand Down Expand Up @@ -213,16 +218,13 @@ def test_rank_signature(self):
'int64',
marks=pytest.mark.xfail(
reason="iNaT is equivalent to minimum value of dtype"
"int64 pending issue #16674")),
([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
'object')
"int64 pending issue #16674"))
])
def test_rank_inf(self, contents, dtype):
dtype_na_map = {
'float64': np.nan,
'float32': np.nan,
'int64': iNaT,
'object': None
'int64': iNaT
}
# Insert nans at random positions if underlying dtype has missing
# value. Then adjust the expected order by adding nans accordingly
Expand All @@ -249,13 +251,10 @@ def _check(s, expected, method='average'):
result = s.rank(method=method)
tm.assert_series_equal(result, Series(expected))

dtypes = [None, object]
disabled = set([(object, 'first')])
dtypes = [None]
results = self.results

for method, dtype in product(results, dtypes):
if (dtype, method) in disabled:
continue
series = s if dtype is None else s.astype(dtype)
_check(series, results[method], method=method)

Expand Down Expand Up @@ -294,7 +293,7 @@ def _check(s, method, na_option, ascending):
for dtype, na_value, pos_inf, neg_inf in dtypes:
in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
iseries = Series(in_arr, dtype=dtype)
if (dtype, method) in disabled:
if dtype == 'object':
continue
_check(iseries, method, na_option, ascending)

Expand Down Expand Up @@ -330,7 +329,7 @@ def test_rank_methods_series(self):
tm.assert_series_equal(result, expected)

def test_rank_dense_method(self):
dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']
in_out = [([1], [1]),
([2], [1]),
([0], [1]),
Expand All @@ -348,7 +347,7 @@ def test_rank_dense_method(self):
assert_series_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']
dtypes = ['f8', 'i8']

for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
Expand All @@ -360,9 +359,6 @@ def test_rank_descending(self):
expected = (s.max() - s).rank()
assert_series_equal(res, expected)

if method == 'first' and dtype == 'O':
continue

expected = (s.max() - s).rank(method=method)
res2 = s.rank(method=method, ascending=False)
assert_series_equal(res2, expected)
Expand All @@ -379,9 +375,15 @@ def test_rank_int(self):
def test_rank_object_bug(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Name of this test can be changed to test_rank_na_object_raises

# GH 13445

# smoke tests
Series([np.nan] * 32).astype(object).rank(ascending=True)
Series([np.nan] * 32).astype(object).rank(ascending=False)
# See #19560
error_msg = ("pandas.core.algorithms.rank "
"not supported for unordered "
"non-numeric data")

with tm.assert_raises_regex(ValueError, error_msg):
Series([np.nan] * 32).astype(object).rank(ascending=True)
with tm.assert_raises_regex(ValueError, error_msg):
Series([np.nan] * 32).astype(object).rank(ascending=False)

def test_rank_modify_inplace(self):
# GH 18521
Expand All @@ -396,7 +398,7 @@ def test_rank_modify_inplace(self):

# GH15630, pct should be on 100% basis when method='dense'

@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -414,7 +416,7 @@ def test_rank_dense_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -432,7 +434,7 @@ def test_rank_min_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand All @@ -450,7 +452,7 @@ def test_rank_max_pct(dtype, ser, exp):
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
Expand Down