From c55966226603059dbebecf6d946d81029c5e668d Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 29 Jul 2016 06:23:43 -0400 Subject: [PATCH 1/5] ENH: add sort_categories argument to union_categoricals --- doc/source/categorical.rst | 10 ++++- pandas/tools/tests/test_concat.py | 36 ++++++++++++++++++ pandas/types/concat.py | 63 +++++++++++++++++-------------- 3 files changed, 80 insertions(+), 29 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index da9c707e07552..d59ad68c9ea83 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -656,7 +656,7 @@ Unioning .. versionadded:: 0.19.0 If you want to combine categoricals that do not necessarily have -the same categories, the `union_categorical` function will +the same categories, the ``union_categoricals`` function will combine a list-like of categoricals. The new categories will be the union of the categories being combined. @@ -667,6 +667,14 @@ will be the union of the categories being combined. b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) +By default, the resulting categories will be ordered as +they appear in the data. If you want the categories to +be lexsorted, use ``sort_categories=True`` argument. + +.. ipython:: python + + union_categoricals([a, b], sort_categories=True) + .. 
note:: In addition to the "easy" case of combining two categoricals of the same diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index dd5b4936c70bb..bcc5bd5759263 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -989,6 +989,42 @@ def test_union_categoricals_ordered(self): with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) + def test_union_categoricals_sort(self): + # GH 13763 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'x', 'y', 'z']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['b', 'x']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index e860ba3e201e9..a02cb98811354 100644 --- a/pandas/types/concat.py +++ 
b/pandas/types/concat.py @@ -211,22 +211,23 @@ def convert_categorical(x): return Categorical(concatted, rawcats) -def union_categoricals(to_union): +def union_categoricals(to_union, sort_categories=False): """ Combine list-like of Categoricals, unioning categories. All - must have the same dtype, and none can be ordered. + categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categoricals + sort_categories : boolean, default False + If true, resulting categories will be lexsorted, otherwise + they will be ordered as they appear in the data Returns ------- - Categorical - A single array, categories will be ordered as they - appear in the list + result : Categorical Raises ------ @@ -244,19 +245,39 @@ def union_categoricals(to_union): first = to_union[0] - if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) - for c in to_union): + if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:]): raise TypeError("dtype of categories must be the same") + ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): - return Categorical(np.concatenate([c.codes for c in to_union]), - categories=first.categories, ordered=first.ordered, - fastpath=True) + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + new_codes = np.concatenate([c.codes for c in to_union]) + + if sort_categories: + categories = categories.sort_values() + indexer = first.categories.get_indexer(categories) + new_codes = take_1d(indexer, new_codes, fill_value=-1) elif all(not c.ordered for c in to_union): - # not ordered - pass + # different categories - union and recode + cats = first.categories.append([c.categories for c in to_union[1:]]) + categories = Index(cats.unique()) + if sort_categories: + categories = categories.sort_values() + + new_codes = [] + for c in to_union: + if len(c.categories) > 0: + indexer = 
categories.get_indexer(c.categories) + new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) + else: + # must be all NaN + new_codes.append(c.codes) + new_codes = np.concatenate(new_codes) else: - # to show a proper error message + # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") @@ -264,21 +285,7 @@ def union_categoricals(to_union): else: raise TypeError('Categorical.ordered must be the same') - cats = first.categories - unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() - categories = Index(unique_cats) - - new_codes = [] - for c in to_union: - if len(c.categories) > 0: - indexer = categories.get_indexer(c.categories) - new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) - else: - # must be all NaN - new_codes.append(c.codes) - - new_codes = np.concatenate(new_codes) - return Categorical(new_codes, categories=categories, ordered=False, + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) From eea177711b62698ff7ecc267b4bc5b01563a9d20 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 29 Jul 2016 17:16:23 -0500 Subject: [PATCH 2/5] skip re-sort when possible on fastpath --- pandas/tools/tests/test_concat.py | 8 ++++++++ pandas/types/concat.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index bcc5bd5759263..4764ec7cbcc7a 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -1006,6 +1006,14 @@ def test_union_categoricals_sort(self): categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], +
categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + c1 = Categorical(['x', np.nan]) c2 = Categorical([np.nan, 'b']) result = union_categoricals([c1, c2], sort_categories=True) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a02cb98811354..5f61f27507ee8 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -256,7 +256,7 @@ def union_categoricals(to_union, sort_categories=False): ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) - if sort_categories: + if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = first.categories.get_indexer(categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) From ecb2ae934baf22df763db83b15d3c2b1b56d71e4 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 2 Aug 2016 05:29:04 -0500 Subject: [PATCH 3/5] more tests; handle sort with ordered --- pandas/tools/tests/test_concat.py | 59 ++++++++++++++++++++++++++++++- pandas/types/concat.py | 7 +++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 4764ec7cbcc7a..48516b6cfb14d 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -990,7 +990,7 @@ def test_union_categoricals_ordered(self): union_categoricals([c1, c2]) def test_union_categoricals_sort(self): - # GH 13763 + # GH 13846 c1 = Categorical(['x', 'y', 'z']) c2 = Categorical(['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) @@ -1033,6 +1033,63 @@ def test_union_categoricals_sort(self): expected = Categorical([]) tm.assert_categorical_equal(result, expected) + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([c1, c2], sort_categories=True) + + def test_union_categoricals_sort_false(self): +
# GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['b', 'a', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['x', 'b']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['b', 'a', 'a', 'c'], + categories=['b', 'a', 'c'], ordered=True) + tm.assert_categorical_equal(result, expected) + + def 
test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5f61f27507ee8..0a985dd6141ae 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -223,7 +223,7 @@ def union_categoricals(to_union, sort_categories=False): to_union : list-like of Categoricals sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise - they will be ordered as they appear in the data + they will be ordered as they appear in the data. Returns ------- @@ -235,6 +235,7 @@ def union_categoricals(to_union, sort_categories=False): - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical + - sort_categories=True and Categoricals are ordered ValueError Emmpty list of categoricals passed """ @@ -256,6 +257,10 @@ def union_categoricals(to_union, sort_categories=False): ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) + if sort_categories and ordered: + raise TypeError("Cannot use sort_categories=True with " + "ordered Categoricals") + if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = first.categories.get_indexer(categories) From ff0bb5ea4126d70f9e6f5afa2cffb94fec452074 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 2 Aug 2016 05:49:24 -0500 Subject: [PATCH 4/5] add follow-up PRs to whatsnew --- doc/source/whatsnew/v0.19.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 62091d7ff03ff..20430d3d5cc54 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -336,7 +336,7 @@ Other enhancements - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - The ``DataFrame`` constructor will now 
respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) +- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) From 3a710f082fd2e5f94866418eb624d3e3ae391791 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 2 Aug 2016 17:48:07 -0500 Subject: [PATCH 5/5] lint fix --- pandas/tools/tests/test_concat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 48516b6cfb14d..968ea979f7c75 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -1089,7 +1089,6 @@ def test_union_categoricals_sort_false(self): categories=['b', 'a', 'c'], ordered=True) tm.assert_categorical_equal(result, expected) - def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2]