-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ERR: Consistent errors for non-numeric ranking. (#19560) #20670
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -859,6 +859,17 @@ def rank(values, axis=0, method='average', na_option='keep', | |
Whether or not to the display the returned rankings in integer form | ||
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). | ||
""" | ||
if is_object_dtype(values): | ||
def raise_non_numeric_error(): | ||
raise ValueError("pandas.core.algorithms.rank " | ||
"not supported for unordered " | ||
"non-numeric data") | ||
if is_categorical_dtype(values): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could be simpler, maybe
in the error message |
||
if not values.ordered: | ||
raise_non_numeric_error() | ||
else: | ||
raise_non_numeric_error() | ||
|
||
if values.ndim == 1: | ||
f, values = _get_data_algo(values, _rank1d_functions) | ||
ranks = f(values, ties_method=method, ascending=ascending, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,23 +71,22 @@ def test_rank2(self): | |
result = df.rank(0, pct=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you make this a separate test and move all of the cases. Ideally you could parameterize them |
||
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) | ||
result = df.rank(1, numeric_only=False) | ||
tm.assert_frame_equal(result, expected) | ||
# See #19560 | ||
error_msg = ("pandas.core.algorithms.rank " | ||
"not supported for unordered " | ||
"non-numeric data") | ||
|
||
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) | ||
result = df.rank(0, numeric_only=False) | ||
tm.assert_frame_equal(result, expected) | ||
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
df.rank(1, numeric_only=False) | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
df.rank(0, numeric_only=False) | ||
|
||
df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) | ||
expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) | ||
result = df.rank(1, numeric_only=False) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) | ||
result = df.rank(0, numeric_only=False) | ||
tm.assert_frame_equal(result, expected) | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
df.rank(1, numeric_only=False) | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
df.rank(0, numeric_only=False) | ||
|
||
# f7u12, this does not work without extensive workaround | ||
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], | ||
|
@@ -110,9 +109,9 @@ def test_rank2(self): | |
self.mixed_frame['datetime'] = datetime.now() | ||
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1) | ||
|
||
result = self.mixed_frame.rank(1) | ||
expected = self.mixed_frame.rank(1, numeric_only=True) | ||
tm.assert_frame_equal(result, expected) | ||
# mixed_frame["foo"] is of string-type | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
self.mixed_frame.rank(1) | ||
|
||
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, | ||
1e60, 1e80, 1e-30]}) | ||
|
@@ -218,7 +217,7 @@ def test_rank_methods_frame(self): | |
tm.assert_frame_equal(result, expected) | ||
|
||
def test_rank_descending(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a test with assert that the object ones raise (unless it duplicates too much the above tests) |
||
dtypes = ['O', 'f8', 'i8'] | ||
dtypes = ['f8', 'i8'] | ||
|
||
for dtype, method in product(dtypes, self.results): | ||
if 'i' in dtype: | ||
|
@@ -230,15 +229,11 @@ def test_rank_descending(self): | |
expected = (df.max() - df).rank() | ||
assert_frame_equal(res, expected) | ||
|
||
if method == 'first' and dtype == 'O': | ||
continue | ||
|
||
expected = (df.max() - df).rank(method=method) | ||
|
||
if dtype != 'O': | ||
res2 = df.rank(method=method, ascending=False, | ||
numeric_only=True) | ||
assert_frame_equal(res2, expected) | ||
res2 = df.rank(method=method, ascending=False, | ||
numeric_only=True) | ||
assert_frame_equal(res2, expected) | ||
|
||
res3 = df.rank(method=method, ascending=False, | ||
numeric_only=False) | ||
|
@@ -258,11 +253,10 @@ def _check2d(df, expected, method='average', axis=0): | |
assert_frame_equal(result, exp_df) | ||
|
||
dtypes = [None, object] | ||
disabled = set([(object, 'first')]) | ||
results = self.results | ||
|
||
for method, axis, dtype in product(results, [0, 1], dtypes): | ||
if (dtype, method) in disabled: | ||
if dtype == object: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use is_object_dtype |
||
continue | ||
frame = df if dtype is None else df.astype(dtype) | ||
_check2d(frame, results[method], method=method, axis=axis) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -134,22 +134,27 @@ def test_rank_categorical(self): | |
assert_series_equal(ordered.rank(), exp) | ||
assert_series_equal(ordered.rank(ascending=False), exp_desc) | ||
|
||
# Unordered categoricals should be ranked as objects | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above |
||
# See #19560 | ||
error_msg = ("pandas.core.algorithms.rank " | ||
"not supported for unordered " | ||
"non-numeric data") | ||
|
||
# Ranking unordered categorials depreciated per #19560 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer to say "not supported" instead of "deprecated" |
||
unordered = Series(['first', 'second', 'third', 'fourth', | ||
'fifth', 'sixth']).astype( | ||
CategoricalDtype(categories=['first', 'second', 'third', | ||
'fourth', 'fifth', 'sixth'], | ||
ordered=False)) | ||
exp_unordered = Series([2., 4., 6., 3., 1., 5.]) | ||
res = unordered.rank() | ||
assert_series_equal(res, exp_unordered) | ||
|
||
with tm.assert_raises_regex(ValueError, error_msg): | ||
unordered.rank() | ||
|
||
unordered1 = Series( | ||
[1, 2, 3, 4, 5, 6], | ||
).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False)) | ||
exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) | ||
res1 = unordered1.rank() | ||
assert_series_equal(res1, exp_unordered1) | ||
|
||
# Won't raise ValueError because entries not objects. | ||
unordered1.rank() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this right? Certainly the example passes the "eye test" but if the Categorical is not ordered what semantics are we using for ranking? |
||
|
||
# Test na_option for rank data | ||
na_ser = Series( | ||
|
@@ -213,16 +218,13 @@ def test_rank_signature(self): | |
'int64', | ||
marks=pytest.mark.xfail( | ||
reason="iNaT is equivalent to minimum value of dtype" | ||
"int64 pending issue #16674")), | ||
([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()], | ||
'object') | ||
"int64 pending issue #16674")) | ||
]) | ||
def test_rank_inf(self, contents, dtype): | ||
dtype_na_map = { | ||
'float64': np.nan, | ||
'float32': np.nan, | ||
'int64': iNaT, | ||
'object': None | ||
'int64': iNaT | ||
} | ||
# Insert nans at random positions if underlying dtype has missing | ||
# value. Then adjust the expected order by adding nans accordingly | ||
|
@@ -249,13 +251,10 @@ def _check(s, expected, method='average'): | |
result = s.rank(method=method) | ||
tm.assert_series_equal(result, Series(expected)) | ||
|
||
dtypes = [None, object] | ||
disabled = set([(object, 'first')]) | ||
dtypes = [None] | ||
results = self.results | ||
|
||
for method, dtype in product(results, dtypes): | ||
if (dtype, method) in disabled: | ||
continue | ||
series = s if dtype is None else s.astype(dtype) | ||
_check(series, results[method], method=method) | ||
|
||
|
@@ -294,7 +293,7 @@ def _check(s, method, na_option, ascending): | |
for dtype, na_value, pos_inf, neg_inf in dtypes: | ||
in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk | ||
iseries = Series(in_arr, dtype=dtype) | ||
if (dtype, method) in disabled: | ||
if dtype == 'object': | ||
continue | ||
_check(iseries, method, na_option, ascending) | ||
|
||
|
@@ -330,7 +329,7 @@ def test_rank_methods_series(self): | |
tm.assert_series_equal(result, expected) | ||
|
||
def test_rank_dense_method(self): | ||
dtypes = ['O', 'f8', 'i8'] | ||
dtypes = ['f8', 'i8'] | ||
in_out = [([1], [1]), | ||
([2], [1]), | ||
([0], [1]), | ||
|
@@ -348,7 +347,7 @@ def test_rank_dense_method(self): | |
assert_series_equal(result, expected) | ||
|
||
def test_rank_descending(self): | ||
dtypes = ['O', 'f8', 'i8'] | ||
dtypes = ['f8', 'i8'] | ||
|
||
for dtype, method in product(dtypes, self.results): | ||
if 'i' in dtype: | ||
|
@@ -360,9 +359,6 @@ def test_rank_descending(self): | |
expected = (s.max() - s).rank() | ||
assert_series_equal(res, expected) | ||
|
||
if method == 'first' and dtype == 'O': | ||
continue | ||
|
||
expected = (s.max() - s).rank(method=method) | ||
res2 = s.rank(method=method, ascending=False) | ||
assert_series_equal(res2, expected) | ||
|
@@ -379,9 +375,15 @@ def test_rank_int(self): | |
def test_rank_object_bug(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Name of this test can be changed to |
||
# GH 13445 | ||
|
||
# smoke tests | ||
Series([np.nan] * 32).astype(object).rank(ascending=True) | ||
Series([np.nan] * 32).astype(object).rank(ascending=False) | ||
# See #19560 | ||
error_msg = ("pandas.core.algorithms.rank " | ||
"not supported for unordered " | ||
"non-numeric data") | ||
|
||
with tm.assert_raises_regex(ValueError, error_msg): | ||
Series([np.nan] * 32).astype(object).rank(ascending=True) | ||
with tm.assert_raises_regex(ValueError, error_msg): | ||
Series([np.nan] * 32).astype(object).rank(ascending=False) | ||
|
||
def test_rank_modify_inplace(self): | ||
# GH 18521 | ||
|
@@ -396,7 +398,7 @@ def test_rank_modify_inplace(self): | |
|
||
# GH15630, pct should be on 100% basis when method='dense' | ||
|
||
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) | ||
@pytest.mark.parametrize('dtype', ['f8', 'i8']) | ||
@pytest.mark.parametrize('ser, exp', [ | ||
([1], [1.]), | ||
([1, 2], [1. / 2, 2. / 2]), | ||
|
@@ -414,7 +416,7 @@ def test_rank_dense_pct(dtype, ser, exp): | |
assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) | ||
@pytest.mark.parametrize('dtype', ['f8', 'i8']) | ||
@pytest.mark.parametrize('ser, exp', [ | ||
([1], [1.]), | ||
([1, 2], [1. / 2, 2. / 2]), | ||
|
@@ -432,7 +434,7 @@ def test_rank_min_pct(dtype, ser, exp): | |
assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) | ||
@pytest.mark.parametrize('dtype', ['f8', 'i8']) | ||
@pytest.mark.parametrize('ser, exp', [ | ||
([1], [1.]), | ||
([1, 2], [1. / 2, 2. / 2]), | ||
|
@@ -450,7 +452,7 @@ def test_rank_max_pct(dtype, ser, exp): | |
assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) | ||
@pytest.mark.parametrize('dtype', ['f8', 'i8']) | ||
@pytest.mark.parametrize('ser, exp', [ | ||
([1], [1.]), | ||
([1, 2], [1. / 2, 2. / 2]), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move to the list of issues. use double-backticks around
ValueError
.'data frame' ->
DataFrame