-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Add fix to raise error when category value 'x' is not predefined but is assigned through df.loc[..]=x #34011
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 58 commits
4487bec
10098ab
cb34580
1622663
c627fa6
ba3a751
51dcdfe
9057b26
06fdc3e
8c8f794
582c023
730fc2b
c275eb9
944ae24
8372bdb
0e5e418
eea359a
5f72d4e
781322a
26f474b
215943e
5bacde9
96e4318
993be66
42d968b
e7ce246
31ef609
ce3f463
a825269
3816789
8651d25
72726a0
0f738c3
51e2032
7d64357
d68f215
5ea8ab1
b08efc1
69f4e62
4c33040
e936736
8031f8f
1e1c094
c889b1b
c08c6c0
e6c3a4c
c862d99
5baa314
cb5d8e4
7d7da20
ecad50f
ab5af93
0c6b68a
4197a74
4bc05c6
af5e141
6d45570
31612ed
4a2a8e8
dd7e3ca
6d9e667
e0da655
92d1f14
9b9b382
d3df994
41aa9e3
ca0eb1f
e2cfb79
931d6c8
8065ddb
5d533dd
b21326b
335fc06
950dcc4
2ee1df8
17120f0
439b49f
c6e3435
fc40817
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
is_sparse, | ||
) | ||
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.arrays.sparse import SparseArray | ||
|
@@ -61,6 +62,70 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: | |
return arr.astype(dtype, copy=False) | ||
|
||
|
||
def _can_cast_to_categorical(to_cast): | ||
""" | ||
Evaluates if a list of arrays can be casted to a single categorical dtype. | ||
The categorical dtype to cast to, is determined by any of the arrays which | ||
is already of categorical dtype. If no such array exists, or if the existing | ||
categorical dtype does not contain any of the unique values of the other arrays, | ||
then it will return False. | ||
|
||
Parameters | ||
---------- | ||
to_cast : array of arrays | ||
|
||
Returns | ||
------- | ||
True if possible to cast to a single categorical dtype, False otherwise. | ||
""" | ||
if len(to_cast) == 0: | ||
raise ValueError("No arrays to cast") | ||
|
||
types = [x.dtype for x in to_cast] | ||
|
||
# If any of the arrays is of categorical dtype, then we will use it as a reference. | ||
# If no such array exists, then we just return. | ||
if any(is_categorical_dtype(t) for t in types): | ||
cat_dtypes = [] | ||
for t in types: | ||
if ( | ||
is_categorical_dtype(t) | ||
and len(t.categories.values) > 0 | ||
and any(~isna(t.categories.values)) | ||
): | ||
categorical_values_dtype = t.categories.values.dtype | ||
if all( | ||
is_categorical_dtype(x) or np.can_cast(categorical_values_dtype, x) | ||
for x in types | ||
): | ||
cat_dtypes.append(t) | ||
if len(cat_dtypes) == 0 or any( | ||
not is_dtype_equal(dtype, cat_dtypes[0]) for dtype in cat_dtypes[1:] | ||
): | ||
return False | ||
else: | ||
return False | ||
|
||
def categorical_contains_values(categorical_dtype, x): | ||
unique_values = np.unique(x[~isna(x)]) | ||
if any( | ||
val not in categorical_dtype.categories for val in unique_values.tolist() | ||
): | ||
return False | ||
return True | ||
|
||
if not all( | ||
categorical_contains_values(to_cast[0].dtype, other) or len(other) == 0 | ||
for other in to_cast[1:] | ||
): | ||
raise ValueError( | ||
"Cannot concat on a Categorical with a new category, " | ||
"set the categories first" | ||
) | ||
|
||
return True | ||
|
||
|
||
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False): | ||
""" | ||
provide concatenation of an array of arrays each of which is a single | ||
|
@@ -108,7 +173,17 @@ def is_nonempty(x) -> bool: | |
# we ignore axis here, as internally concatting with EAs is always | ||
# for axis=0 | ||
if not single_dtype: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
# Special case for handling concat with categorical series. | ||
# We need to make sure that categorical dtype is preserved | ||
# when an array of valid values is given (GH#25383) | ||
if ( | ||
isinstance(to_concat[0], ExtensionArray) | ||
and all(x.shape[0] == 1 for x in to_concat[1:]) | ||
and _can_cast_to_categorical(to_concat) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a very complicated implementation. This should all be in `find_common_type`, but should be much simpler than this: either the dtypes are the same or they are not. Changing them is not in scope for this issue. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, the scope is clearer to me now. I will revert to the previous approach and adapt it to raise on |
||
): | ||
target_dtype = to_concat[0].dtype | ||
else: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] | ||
|
||
if isinstance(to_concat[0], ExtensionArray): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import pytest | ||
|
||
from pandas.core.dtypes.concat import _can_cast_to_categorical | ||
|
||
import pandas as pd | ||
from pandas import Categorical | ||
import pandas._testing as tm | ||
|
||
|
||
class TestCategoricalSeries:
    """Setting values on categorical data via ``.loc`` and the casting helper."""

    def test_setitem_undefined_category_raises(self):
        # Overwriting an existing label with a value that is not a category.
        cat_ser = pd.Series(Categorical(["a", "b", "c"]))
        expected_msg = (
            "Cannot setitem on a Categorical with a new category, "
            "set the categories first"
        )
        with pytest.raises(ValueError, match=expected_msg):
            cat_ser.loc[2] = "d"

    def test_concat_undefined_category_raises(self):
        # Enlarging with a new label whose value is not a category.
        cat_ser = pd.Series(Categorical(["a", "b", "c"]))
        expected_msg = (
            "Cannot concat on a Categorical with a new category, "
            "set the categories first"
        )
        with pytest.raises(ValueError, match=expected_msg):
            cat_ser.loc[3] = "d"

    def test_loc_category_dtype_retention(self):
        # Case 1: enlarging a frame with a row holding a known category
        letters = ["a", "b", "c"]
        frame = pd.DataFrame(
            {
                "int": [0, 1, 2],
                "cat": Categorical(["a", "b", "c"], categories=letters),
            }
        )
        frame.loc[3] = [3, "c"]
        expected_frame = pd.DataFrame(
            {
                "int": [0, 1, 2, 3],
                "cat": Categorical(["a", "b", "c", "c"], categories=letters),
            }
        )
        tm.assert_frame_equal(frame, expected_frame)

        # Case 2: enlarging a string-valued categorical series
        str_ser = pd.Series(Categorical(["a", "b", "c"]))
        str_ser.loc[3] = "c"
        tm.assert_series_equal(str_ser, pd.Series(Categorical(["a", "b", "c", "c"])))

        # Case 3: enlarging an int-valued categorical series
        int_ser = pd.Series(Categorical([1, 2, 3]))
        int_ser.loc[3] = 3
        tm.assert_series_equal(int_ser, pd.Series(Categorical([1, 2, 3, 3])))

        # Case 4: enlarging with a missing value
        na_ser = pd.Series(Categorical([1, 2, 3]))
        na_ser.loc[3] = pd.NA
        tm.assert_series_equal(na_ser, pd.Series(Categorical([1, 2, 3, pd.NA])))

    def test_can_cast_to_categorical(self):
        new_category_msg = (
            "Cannot concat on a Categorical with a new category, "
            "set the categories first"
        )

        # Case 1:
        # Series of identical categorical dtype should
        # be able to concat to categorical
        left = pd.Series(Categorical(["a", "b", "c"]))
        right = pd.Series(Categorical(["a", "b", "c"]))
        assert _can_cast_to_categorical([left, right]) is True

        # Case 2:
        # Series of non-identical categorical dtype should
        # not be able to concat to categorical
        left = pd.Series(Categorical(["a", "b", "c"]))
        right = pd.Series(Categorical(["a", "b", "d"]))
        assert _can_cast_to_categorical([left, right]) is False

        # Concat of a categorical series with a series
        # containing only values identical to the
        # categorical values should be possible

        # Case 3: For string categorical values
        left = pd.Series(Categorical(["a", "b", "c"]))
        right = pd.Series(["a", "a", "b"])
        assert _can_cast_to_categorical([left, right]) is True

        # Case 4: For int categorical values
        left = pd.Series(Categorical([1, 2, 3]))
        right = pd.Series([1, 2])
        assert _can_cast_to_categorical([left, right]) is True

        # The rest should raise because not all values
        # are present in the categorical.

        # Case 5
        left = pd.Series(Categorical([1, 2, 3]))
        right = pd.Series([3, 4])
        with pytest.raises(ValueError, match=new_category_msg):
            _can_cast_to_categorical([left, right])

        # Case 6
        left = pd.Series(Categorical(["a", "b", "c"]))
        right = pd.Series(["d", "e"])
        with pytest.raises(ValueError, match=new_category_msg):
            _can_cast_to_categorical([left, right])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I couldn't find a better way to detect when the
concat_compat
function is called through index expansion, so in cases like this: With the latest commit we are raising a
ValueError
when an invalid value is added to the categorical through index expansion. It also enables the index expansion of a categorical of any dtype.