From 221bd87bc228827dec941e8a895dac5da18b6a5f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 May 2021 19:34:52 -0700 Subject: [PATCH 1/6] tests not passing but i need to rebase again --- pandas/core/dtypes/concat.py | 14 ++++++++++++++ pandas/core/indexes/category.py | 8 +++++++- pandas/tests/base/test_value_counts.py | 21 ++++++++------------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b0d00775bbed1..5999d7e13b743 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -125,6 +125,20 @@ def is_nonempty(x) -> bool: if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 + if any(is_categorical_dtype(x.dtype) for x in to_concat): + first = [x for x in to_concat if is_categorical_dtype(x.dtype)][0] + from pandas import Index + try: + codes = np.concatenate([Index(first)._is_dtype_compat(Index(c)).codes for c in to_concat]) + except TypeError: + # not all to_concat elements are among our categories (or NA) + pass + else: + cat = first._from_backing_data(codes) + if first.ordered: + cat = cat.as_ordered() + return cat + if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7339c82cbcc77..52a9244bcafe1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -548,6 +548,8 @@ def map(self, mapper): return Index(mapped, name=self.name) def _concat(self, to_concat: list[Index], name: Hashable) -> Index: + alt = Index._concat(self, to_concat, name=name) # uses concat_compat + # if calling index is category, don't check dtype of others try: codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) @@ -556,7 +558,11 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: from pandas.core.dtypes.concat import concat_compat res = concat_compat(to_concat) - return Index(res, name=name) + out = Index(res, name=name) + assert out.equals(alt) + assert out.dtype == alt.dtype + return out else: cat = self._data._from_backing_data(codes) + assert cat.dtype == alt.dtype return type(self)._simple_new(cat, name=name) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 10f391a49d98f..cc591523dea0a 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -43,7 +43,8 @@ def test_value_counts(index_or_series_obj): @pytest.mark.parametrize("null_obj", [np.nan, None]) -def test_value_counts_null(null_obj, index_or_series_obj): +@pytest.mark.parametrize("dropna", [True, False]) +def test_value_counts_null(null_obj, dropna, index_or_series_obj): orig = index_or_series_obj obj = orig.copy() @@ -70,20 +71,14 @@ def test_value_counts_null(null_obj, index_or_series_obj): expected = Series(dict(counter.most_common()), dtype=np.int64) expected.index = expected.index.astype(obj.dtype) - result = obj.value_counts() - if obj.duplicated().any(): - # TODO: - # Order of entries with the same count is inconsistent on CI (gh-32449) - expected = expected.sort_index() - result = result.sort_index() - tm.assert_series_equal(result, expected) + if not dropna: + # can't use expected[null_obj] = 3 as + # IntervalIndex doesn't allow assignment + new_entry = Series({np.nan: 3}, dtype=np.int64) + expected = expected.append(new_entry) - # can't use 
expected[null_obj] = 3 as - # IntervalIndex doesn't allow assignment - new_entry = Series({np.nan: 3}, dtype=np.int64) - expected = expected.append(new_entry) + result = obj.value_counts(dropna=dropna) - result = obj.value_counts(dropna=False) if obj.duplicated().any(): # TODO: # Order of entries with the same count is inconsistent on CI (gh-32449) From 1f45dbd28fecbaded6beb5ceb740bb74e419d5c4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 4 Aug 2021 14:29:23 -0700 Subject: [PATCH 2/6] API: make concat_compat behave like CategoricalIndex._concat --- pandas/core/dtypes/concat.py | 20 ++++++--- pandas/core/indexes/category.py | 23 +--------- .../tests/indexes/categorical/test_append.py | 5 ++- .../reshape/concat/test_append_common.py | 43 ++++++++++--------- pandas/tests/reshape/concat/test_empty.py | 6 +-- 5 files changed, 45 insertions(+), 52 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 5999d7e13b743..542bd8357ef25 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -125,17 +125,27 @@ def is_nonempty(x) -> bool: if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 - if any(is_categorical_dtype(x.dtype) for x in to_concat): - first = [x for x in to_concat if is_categorical_dtype(x.dtype)][0] - from pandas import Index + cats = [x for x in to_concat if is_categorical_dtype(x.dtype)] + if len(cats): + # TODO: Ideally this shouldn't be order-dependent + first = cats[0] + from pandas import ( + CategoricalIndex, + Index, + ) + + ci = CategoricalIndex(first) + try: - codes = np.concatenate([Index(first)._is_dtype_compat(Index(c)).codes for c in to_concat]) + codes = np.concatenate( + [ci._is_dtype_compat(Index(c)).codes for c in to_concat] + ) except TypeError: # not all to_concat elements are among our categories (or NA) pass else: cat = first._from_backing_data(codes) - if first.ordered: + if all(x.dtype.ordered for x in cats): cat = cat.as_ordered() return cat diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 52a9244bcafe1..6d00a5cd3cc7c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -278,7 +278,8 @@ def _is_dtype_compat(self, other) -> Categorical: "categories must match existing categories when appending" ) - return other + # TODO: this is a lot like the non-coercing constructor + return other.astype(self.dtype, copy=False) def equals(self, other: object) -> bool: """ @@ -546,23 +547,3 @@ def map(self, mapper): """ mapped = self._values.map(mapper) return Index(mapped, name=self.name) - - def _concat(self, to_concat: list[Index], name: Hashable) -> Index: - alt = Index._concat(self, to_concat, name=name) # uses concat_compat - - # if calling index is category, don't check dtype of others - try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - except TypeError: - # not all to_concat elements are among our categories (or NA) - from pandas.core.dtypes.concat import concat_compat - - res = concat_compat(to_concat) - out = Index(res, name=name) - assert out.equals(alt) - assert out.dtype == alt.dtype - return out - else: - cat = self._data._from_backing_data(codes) - assert cat.dtype == alt.dtype - return type(self)._simple_new(cat, name=name) diff --git a/pandas/tests/indexes/categorical/test_append.py b/pandas/tests/indexes/categorical/test_append.py index b48c3219f5111..2a99f72de947b 100644 --- a/pandas/tests/indexes/categorical/test_append.py +++ 
b/pandas/tests/indexes/categorical/test_append.py @@ -48,9 +48,10 @@ def test_append_non_categories(self, ci): tm.assert_index_equal(result, expected, exact=True) def test_append_object(self, ci): - # GH#14298 - if base object is not categorical -> coerce to object + # GH#14298 - if base object and all entries are among + # categories -> cast to categorical (GH#41626) result = Index(["c", "a"]).append(ci) - expected = Index(list("caaabbca")) + expected = Index(list("caaabbca"), dtype=ci.dtype) tm.assert_index_equal(result, expected, exact=True) def test_append_to_another(self): diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index b8b254e786194..1bcf2b631e9b8 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -511,18 +511,18 @@ def test_union_categorical_same_categories_different_order(self): tm.assert_series_equal(result, expected) def test_concat_categorical_coercion(self): - # GH 13524 + # GH 13524, GH#41626 - # category + not-category => not-category + # category + not-category (but all-castable/nan) => category s1 = Series([1, 2, np.nan], dtype="category") s2 = Series([2, 1, 2]) - exp = Series([1, 2, np.nan, 2, 1, 2], dtype="object") + exp = Series([1, 2, np.nan, 2, 1, 2], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = Series([2, 1, 2, 1, 2, np.nan], dtype="object") + exp = Series([2, 1, 2, 1, 2, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) @@ -562,31 +562,31 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - # if normal series only contains NaN-likes => not-category + # if normal series only contains NaN-likes => category (GH#41626) s1 = Series([10, 11], dtype="category") s2 = Series([np.nan, np.nan, np.nan]) - exp = Series([10, 11, np.nan, np.nan, np.nan]) + exp = Series([10, 11, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = Series([np.nan, np.nan, np.nan, 10, 11]) + exp = Series([np.nan, np.nan, np.nan, 10, 11], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) def test_concat_categorical_3elem_coercion(self): - # GH 13524 + # GH 13524, GH#41626 - # mixed dtypes => not-category + # mixed dtypes, all castable to our categories => category (GH#41626) s1 = Series([1, 2, np.nan], dtype="category") s2 = Series([2, 1, 2], dtype="category") s3 = Series([1, 2, 1, 2, np.nan]) - exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") + exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") + exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), 
exp) @@ -654,7 +654,7 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) def test_concat_categorical_coercion_nan(self): - # GH 13524 + # GH 13524, GH#41626 # some edge cases # category + not-category => not category @@ -665,18 +665,19 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + # all elements of s2 are nan => category (GH#41626) s1 = Series([1, np.nan], dtype="category") s2 = Series([np.nan, np.nan]) - exp = Series([1, np.nan, np.nan, np.nan], dtype="float") + exp = Series([1, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - # mixed dtype, all nan-likes => not-category + # mixed dtype, all nan-likes => category (GH#41626) s1 = Series([np.nan, np.nan], dtype="category") s2 = Series([np.nan, np.nan]) - exp = Series([np.nan, np.nan, np.nan, np.nan]) + exp = Series([np.nan, np.nan, np.nan, np.nan], dtype=s1.dtype) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) @@ -692,7 +693,7 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) def test_concat_categorical_empty(self): - # GH 13524 + # GH 13524, GH#41626 s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") @@ -712,11 +713,11 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([], dtype="object") - # different dtype => not-category - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + # different dtype, but all castable (bc empty) => category (GH#41626) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s1) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s1) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s1) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s1) s1 = Series([], dtype="category") s2 = Series([np.nan, np.nan]) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 63482dbc1502c..966a0883eaa4b 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -105,7 +105,7 @@ def test_concat_empty_series_timelike(self, tz, values): ("M8[ns]", np.int64, np.object_), # categorical ("category", "category", "category"), - ("category", "object", "object"), + ("category", "object", "category"), # GH#41626 ], ) def test_concat_empty_series_dtypes(self, left, right, expected): @@ -177,12 +177,12 @@ def test_concat_empty_series_dtypes_triple(self): ) def test_concat_empty_series_dtype_category_with_array(self): - # GH#18515 + # GH#18515, GH#41626 assert ( concat( [Series(np.array([]), dtype="category"), Series(dtype="float64")] ).dtype - == "float64" + == "category" ) def test_concat_empty_series_dtypes_sparse(self): From abdd2a8c0a85594a00997654c72ce0c3bd8d6482 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Nov 2021 12:57:01 -0800 Subject: [PATCH 3/6] avoid FutureWarning --- pandas/core/dtypes/concat.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 39b462021557c..c58ec320dd40e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -134,7 +134,7 @@ def is_nonempty(x) -> bool: try: codes = np.concatenate( - [ci._is_dtype_compat(Index(c)).codes for c in to_concat] + [ci._is_dtype_compat(Index._with_infer(c)).codes for c in to_concat] ) except TypeError: # not all to_concat elements are among our categories (or NA) From 9164016475bf959f7c4f2f372349f3389c19b673 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 Jan 2022 11:50:39 -0800 Subject: [PATCH 4/6] avoid append --- pandas/tests/reshape/concat/test_append_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 25f5e1a309510..6a192f821c47f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -727,9 +727,9 @@ def test_concat_categorical_empty(self): # different dtype, but all castable (bc empty) => category (GH#41626) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s1) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s1) + tm.assert_series_equal(s1._append(s2, ignore_index=True), s1) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s1) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s1) + tm.assert_series_equal(s2._append(s1, ignore_index=True), s1) s1 = Series([], dtype="category") s2 = Series([np.nan, np.nan]) From 7ba8d2d2954ba710d81013041a33bf6c8ae79581 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 Jan 2022 11:56:08 -0800 Subject: [PATCH 5/6] catch warnings --- pandas/tests/arrays/sparse/test_combine_concat.py | 11 +++++++++-- pandas/tests/extension/test_sparse.py | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index 0f09af269148b..f18aab416a2ea 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -53,10 +53,17 @@ def test_concat_with_non_sparse(other, expected_dtype): # https://github.com/pandas-dev/pandas/issues/34336 s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0)) - result = pd.concat([s_sparse, other], ignore_index=True) + msg = "passing a SparseArray to pd.Index" + warn = FutureWarning + if isinstance(expected_dtype, pd.SparseDtype): + warn = None + + with tm.assert_produces_warning(warn, match=msg): + result = pd.concat([s_sparse, other], ignore_index=True) expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype) tm.assert_series_equal(result, expected) - result = pd.concat([other, s_sparse], ignore_index=True) + with tm.assert_produces_warning(warn, match=msg): + result = pd.concat([other, s_sparse], ignore_index=True) expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 3a37ea4d673af..5948db9ab5434 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -139,7 +139,9 @@ def test_concat_mixed_dtypes(self, data): dfs = [df1, df2, df3] # dataframes - result = pd.concat(dfs) + msg = "passing a SparseArray to pd.Index" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.concat(dfs) expected = pd.concat( [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs] ) From 75ab0a0717dee79f4aa87c514929912087415a94 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 Jan 2022 19:56:16 -0800 Subject: [PATCH 6/6] ArrayManager compat --- pandas/tests/extension/test_sparse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5948db9ab5434..1fe046488eef5 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -130,7 +130,7 @@ class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): class TestReshaping(BaseSparseTests, base.BaseReshapingTests): - def test_concat_mixed_dtypes(self, data): + def test_concat_mixed_dtypes(self, data, using_array_manager): # https://github.com/pandas-dev/pandas/issues/20762 # This should be the same, aside from concat([sparse, float]) df1 = pd.DataFrame({"A": data[:3]}) @@ -140,7 +140,8 @@ def test_concat_mixed_dtypes(self, data): # dataframes msg = "passing a SparseArray to pd.Index" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = None if using_array_manager else FutureWarning + with tm.assert_produces_warning(warn, match=msg): result = pd.concat(dfs) expected = pd.concat( [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs]
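
Taken together, these patches make pd.concat (and Index/Series append) keep the categorical dtype when a categorical operand is combined with non-categorical operands whose values are all among its categories or NA, rather than falling back to object/float. Below is a minimal sketch of the resulting behavior, mirroring the updated test_concat_categorical_coercion case above; it reflects the behavior proposed in this series (GH#41626), not necessarily released pandas.

import numpy as np
import pandas as pd

s1 = pd.Series([1, 2, np.nan], dtype="category")
s2 = pd.Series([2, 1, 2])  # every value is among s1's categories (or NA)

result = pd.concat([s1, s2], ignore_index=True)
print(result.dtype)  # with these patches: category (s1.dtype); previously: object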