Skip to content

API: make concat_compat match CategoricalIndex._concat #42892

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
jbrockmendel wants to merge 24 commits into main from ci-concat
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
221bd87
tests not passing but i need to rebase again
jbrockmendel May 21, 2021
1f45dbd
API: make concat_compat behave like CategoricalIndex._concat
jbrockmendel Aug 4, 2021
d68f868
Merge branch 'master' into ci-concat
jbrockmendel Nov 9, 2021
4cc59c7
Merge branch 'master' into ci-concat
jbrockmendel Nov 23, 2021
53d3aea
Merge branch 'master' into ci-concat
jbrockmendel Nov 25, 2021
329d2ba
Merge branch 'master' into ci-concat
jbrockmendel Nov 26, 2021
abdd2a8
avoid FutureWarning
jbrockmendel Nov 26, 2021
6b9a75e
Merge branch 'master' into ci-concat
jbrockmendel Dec 5, 2021
ac7debd
Merge branch 'master' into ci-concat
jbrockmendel Dec 6, 2021
94976da
Merge branch 'master' into ci-concat
jbrockmendel Dec 12, 2021
6b2c55d
Merge branch 'master' into ci-concat
jbrockmendel Dec 26, 2021
bb6aa1a
Merge branch 'master' into ci-concat
jbrockmendel Dec 27, 2021
3120e51
Merge branch 'master' into ci-concat
jbrockmendel Dec 27, 2021
84fed7a
Merge branch 'master' into ci-concat
jbrockmendel Jan 1, 2022
8e58449
Merge branch 'master' into ci-concat
jbrockmendel Jan 1, 2022
8e9f60a
Merge branch 'master' into ci-concat
jbrockmendel Jan 6, 2022
9164016
avoid append
jbrockmendel Jan 6, 2022
7ba8d2d
catch warnings
jbrockmendel Jan 6, 2022
6daccec
Merge branch 'master' into ci-concat
jbrockmendel Jan 7, 2022
75ab0a0
ArrayManager compat
jbrockmendel Jan 7, 2022
1f6ac46
Merge branch 'master' into ci-concat
jbrockmendel Jan 8, 2022
16481a0
Merge branch 'master' into ci-concat
jbrockmendel Jan 11, 2022
881f47f
Merge branch 'main' into ci-concat
jbrockmendel Jan 13, 2022
ee5dd27
Merge branch 'main' into ci-concat
jbrockmendel Jan 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
find_common_type,
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_dtype_equal,
is_sparse,
)
Expand Down Expand Up @@ -112,6 +113,30 @@ def is_nonempty(x) -> bool:
if any_ea:
# we ignore axis here, as internally concatting with EAs is always
# for axis=0
cats = [x for x in to_concat if is_categorical_dtype(x.dtype)]
if len(cats):
# TODO: Ideally this shouldn't be order-dependent
first = cats[0]
from pandas import (
CategoricalIndex,
Index,
)

ci = CategoricalIndex(first)

try:
codes = np.concatenate(
[ci._is_dtype_compat(Index._with_infer(c)).codes for c in to_concat]
)
except TypeError:
# not all to_concat elements are among our categories (or NA)
pass
else:
cat = first._from_backing_data(codes)
if all(x.dtype.ordered for x in cats):
cat = cat.as_ordered()
return cat

if not single_dtype:
target_dtype = find_common_type([x.dtype for x in to_concat])
target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
Expand Down
17 changes: 2 additions & 15 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ def _is_dtype_compat(self, other) -> Categorical:
"categories must match existing categories when appending"
)

return other
# TODO: this is a lot like the non-coercing constructor
return other.astype(self.dtype, copy=False)

@doc(Index.astype)
def astype(self, dtype: Dtype, copy: bool = True) -> Index:
Expand Down Expand Up @@ -567,17 +568,3 @@ def map(self, mapper):
"""
mapped = self._values.map(mapper)
return Index(mapped, name=self.name)

def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
    """
    Concatenate ``to_concat`` into a single Index named ``name``.

    Each element is passed through ``self._is_dtype_compat`` and the
    resulting codes are joined; since the calling index is categorical,
    the dtypes of the other elements are not checked beforehand.  If that
    step raises ``TypeError``, the elements' ``_values`` are combined with
    ``concat_compat`` and wrapped in a plain ``Index`` instead.
    """
    try:
        code_arrays = [self._is_dtype_compat(idx).codes for idx in to_concat]
        merged_codes = np.concatenate(code_arrays)
    except TypeError:
        # not all to_concat elements are among our categories (or NA)
        from pandas.core.dtypes.concat import concat_compat

        values = [idx._values for idx in to_concat]
        return Index(concat_compat(values), name=name)

    # every element was compatible: rebuild from the joined codes
    backing = self._data._from_backing_data(merged_codes)
    return type(self)._simple_new(backing, name=name)
11 changes: 9 additions & 2 deletions pandas/tests/arrays/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,17 @@ def test_concat_with_non_sparse(other, expected_dtype):
# https://github.com/pandas-dev/pandas/issues/34336
s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

result = pd.concat([s_sparse, other], ignore_index=True)
msg = "passing a SparseArray to pd.Index"
warn = FutureWarning
if isinstance(expected_dtype, pd.SparseDtype):
warn = None

with tm.assert_produces_warning(warn, match=msg):
result = pd.concat([s_sparse, other], ignore_index=True)
expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype)
tm.assert_series_equal(result, expected)

result = pd.concat([other, s_sparse], ignore_index=True)
with tm.assert_produces_warning(warn, match=msg):
result = pd.concat([other, s_sparse], ignore_index=True)
expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype)
tm.assert_series_equal(result, expected)
18 changes: 7 additions & 11 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def test_value_counts(index_or_series_obj):


@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_null(null_obj, index_or_series_obj):
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_null(null_obj, dropna, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()

Expand All @@ -66,7 +67,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
expected = Series(dict(counter.most_common()), dtype=np.int64)
expected.index = expected.index.astype(obj.dtype)

result = obj.value_counts()
result = obj.value_counts(dropna=dropna)

if not dropna:
expected[null_obj] = 3

if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
Expand All @@ -76,16 +81,7 @@ def test_value_counts_null(null_obj, index_or_series_obj):
if not isinstance(result.dtype, np.dtype):
# i.e. IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)

expected[null_obj] = 3

result = obj.value_counts(dropna=False)
if obj.duplicated().any():
# TODO(GH#32514):
# Order of entries with the same count is inconsistent on CI (gh-32449)
expected = expected.sort_index()
result = result.sort_index()
tm.assert_series_equal(result, expected)


Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class TestConstructors(BaseSparseTests, base.BaseConstructorsTests):


class TestReshaping(BaseSparseTests, base.BaseReshapingTests):
def test_concat_mixed_dtypes(self, data):
def test_concat_mixed_dtypes(self, data, using_array_manager):
# https://github.com/pandas-dev/pandas/issues/20762
# This should be the same, aside from concat([sparse, float])
df1 = pd.DataFrame({"A": data[:3]})
Expand All @@ -144,7 +144,10 @@ def test_concat_mixed_dtypes(self, data):
dfs = [df1, df2, df3]

# dataframes
result = pd.concat(dfs)
msg = "passing a SparseArray to pd.Index"
warn = None if using_array_manager else FutureWarning
with tm.assert_produces_warning(warn, match=msg):
result = pd.concat(dfs)
expected = pd.concat(
[x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs]
)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/indexes/categorical/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def test_append_non_categories(self, ci):
tm.assert_index_equal(result, expected, exact=True)

def test_append_object(self, ci):
# GH#14298 - if base object is not categorical -> coerce to object
# GH#14298 - if base object is not categorical but all of its entries are
# among the categories -> cast to categorical (GH#41626)
result = Index(["c", "a"]).append(ci)
expected = Index(list("caaabbca"))
expected = Index(list("caaabbca"), dtype=ci.dtype)
tm.assert_index_equal(result, expected, exact=True)

def test_append_to_another(self):
Expand Down
43 changes: 22 additions & 21 deletions pandas/tests/reshape/concat/test_append_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,18 +523,18 @@ def test_union_categorical_same_categories_different_order(self):
tm.assert_series_equal(result, expected)

def test_concat_categorical_coercion(self):
# GH 13524
# GH 13524, GH#41626

# category + not-category => not-category
# category + not-category (but all-castable/nan) => category
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2])

exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64)
exp = Series([1, 2, np.nan, 2, 1, 2], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

# result shouldn't be affected by 1st elem dtype
exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64)
exp = Series([2, 1, 2, 1, 2, np.nan], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

Expand Down Expand Up @@ -574,31 +574,31 @@ def test_concat_categorical_coercion(self):
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

# if normal series only contains NaN-likes => not-category
# if normal series only contains NaN-likes => category (GH#41626)
s1 = Series([10, 11], dtype="category")
s2 = Series([np.nan, np.nan, np.nan])

exp = Series([10, 11, np.nan, np.nan, np.nan])
exp = Series([10, 11, np.nan, np.nan, np.nan], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

exp = Series([np.nan, np.nan, np.nan, 10, 11])
exp = Series([np.nan, np.nan, np.nan, 10, 11], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
tm.assert_series_equal(s2._append(s1, ignore_index=True), exp)

def test_concat_categorical_3elem_coercion(self):
# GH 13524
# GH 13524, GH#41626

# mixed dtypes => not-category
# mixed dtypes, all castable to our categories => category (GH#41626)
s1 = Series([1, 2, np.nan], dtype="category")
s2 = Series([2, 1, 2], dtype="category")
s3 = Series([1, 2, 1, 2, np.nan])

exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
tm.assert_series_equal(s1._append([s2, s3], ignore_index=True), exp)

exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s3._append([s1, s2], ignore_index=True), exp)

Expand Down Expand Up @@ -666,7 +666,7 @@ def test_concat_categorical_ordered(self):
tm.assert_series_equal(s1._append([s2, s1], ignore_index=True), exp)

def test_concat_categorical_coercion_nan(self):
# GH 13524
# GH 13524, GH#41626

# some edge cases
# category + not-category => not category
Expand All @@ -677,18 +677,19 @@ def test_concat_categorical_coercion_nan(self):
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

# all elements of s2 are nan => category (GH#41626)
s1 = Series([1, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])

exp = Series([1, np.nan, np.nan, np.nan], dtype="float")
exp = Series([1, np.nan, np.nan, np.nan], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

# mixed dtype, all nan-likes => not-category
# mixed dtype, all nan-likes => category (GH#41626)
s1 = Series([np.nan, np.nan], dtype="category")
s2 = Series([np.nan, np.nan])

exp = Series([np.nan, np.nan, np.nan, np.nan])
exp = Series([np.nan, np.nan, np.nan, np.nan], dtype=s1.dtype)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
Expand All @@ -704,7 +705,7 @@ def test_concat_categorical_coercion_nan(self):
tm.assert_series_equal(s1._append(s2, ignore_index=True), exp)

def test_concat_categorical_empty(self):
# GH 13524
# GH 13524, GH#41626

s1 = Series([], dtype="category")
s2 = Series([1, 2], dtype="category")
Expand All @@ -724,11 +725,11 @@ def test_concat_categorical_empty(self):
s1 = Series([], dtype="category")
s2 = Series([], dtype="object")

# different dtype => not-category
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s2)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s2)
# different dtype, but all castable (because empty) => category (GH#41626)
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s1)
tm.assert_series_equal(s1._append(s2, ignore_index=True), s1)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s1)
tm.assert_series_equal(s2._append(s1, ignore_index=True), s1)

s1 = Series([], dtype="category")
s2 = Series([np.nan, np.nan])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/reshape/concat/test_empty.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_concat_empty_series_timelike(self, tz, values):
("M8[ns]", np.int64, np.object_),
# categorical
("category", "category", "category"),
("category", "object", "object"),
("category", "object", "category"), # GH#41626
],
)
def test_concat_empty_series_dtypes(self, left, right, expected):
Expand Down Expand Up @@ -182,12 +182,12 @@ def test_concat_empty_series_dtypes_triple(self):
)

def test_concat_empty_series_dtype_category_with_array(self):
# GH#18515
# GH#18515, GH#41626
assert (
concat(
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
).dtype
== "float64"
== "category"
)

def test_concat_empty_series_dtypes_sparse(self):
Expand Down