From 589d88dcd3ef84356eb2ea74f6574b37d979d58d Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 23 Jul 2016 12:10:21 +0900 Subject: [PATCH 1/2] ENH: concat and append now can handle unordered categories --- doc/source/categorical.rst | 55 ++- doc/source/merging.rst | 32 +- doc/source/whatsnew/v0.19.0.txt | 34 +- pandas/core/frame.py | 10 +- pandas/core/internals.py | 25 +- pandas/core/series.py | 10 +- pandas/tests/series/test_combine_concat.py | 4 +- pandas/tests/test_categorical.py | 186 +++++------ pandas/tools/merge.py | 39 ++- pandas/tools/tests/test_concat.py | 371 +++++++++++++++++++++ pandas/types/concat.py | 70 ++-- 11 files changed, 653 insertions(+), 183 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index d59ad68c9ea83..ccbda68f3a822 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -675,12 +675,57 @@ be lexsorted, use ``sort_categories=True`` argument. union_categoricals([a, b], sort_categories=True) -.. note:: +``union_categoricals`` also works with the "easy" case of combining two +categoricals of the same categories and order information +(e.g. what you could also ``append`` for). + +.. ipython:: python + + a = pd.Categorical(["a", "b"], ordered=True) + b = pd.Categorical(["a", "b", "a"], ordered=True) + union_categoricals([a, b]) + +The below raises ``TypeError`` because the categories are ordered and not identical. + +.. code-block:: ipython + + In [1]: a = pd.Categorical(["a", "b"], ordered=True) + In [2]: b = pd.Categorical(["a", "b", "c"], ordered=True) + In [3]: union_categoricals([a, b]) + Out[3]: + TypeError: to union ordered Categoricals, all categories must be the same + +.. _categorical.concat: + +Concatenation +~~~~~~~~~~~~~ + +This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for a general description. + +By default, ``Series`` or ``DataFrame`` concatenation which contains different +categories results in ``object`` dtype.
+ +.. ipython:: python + + s1 = pd.Series(['a', 'b'], dtype='category') + s2 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s2]) + +Specifying ``union_categoricals=True`` concatenates the categories following the +``union_categoricals`` rule. + +.. ipython:: python + + pd.concat([s1, s2], union_categoricals=True) + +The following table summarizes the results of ``Categoricals`` related concatenations. - In addition to the "easy" case of combining two categoricals of the same - categories and order information (e.g. what you could also ``append`` for), - ``union_categoricals`` only works with unordered categoricals and will - raise if any are ordered. +| arg1 | arg2 | default | ``union_categoricals=True`` | +|---------|-------------------------------------------|---------|------------------------------| +| category | category (identical categories) | category | category | +| category | category (different categories, both not ordered) | object (dtype is inferred) | category | +| category | category (different categories, either one is ordered) | object (dtype is inferred) | object (dtype is inferred) | +| category | not category | object (dtype is inferred) | object (dtype is inferred) | Getting Data In/Out ------------------- diff --git a/doc/source/merging.rst b/doc/source/merging.rst index f14e5741c6e2e..f56188dfb10f9 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -78,34 +78,40 @@ some configurable handling of "what to do with the other axes": :: pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False) + keys=None, levels=None, names=None, verify_integrity=False, + union_categoricals=False, copy=True) -- ``objs``: a sequence or mapping of Series, DataFrame, or Panel objects.
If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. -- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along. -- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on +- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. +- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on other axis(es). Outer for union and inner for intersection. -- ``join_axes``: list of Index objects. Specific indexes to use for the other +- ``ignore_index`` : boolean, default False. If True, do not use the index + values on the concatenation axis. The resulting axis will be labeled 0, ..., + n - 1. This is useful if you are concatenating objects where the + concatenation axis does not have meaningful indexing information. Note + the index values on the other axes are still respected in the join. +- ``join_axes`` : list of Index objects. Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic. -- ``keys``: sequence, default None. Construct hierarchical index using the +- ``keys`` : sequence, default None. Construct hierarchical index using the passed keys as the outermost level. If multiple levels passed, should contain tuples. - ``levels`` : list of sequences, default None. Specific levels (unique values) to use for constructing a MultiIndex. Otherwise they will be inferred from the keys. -- ``names``: list, default None. Names for the levels in the resulting +- ``names`` : list, default None. Names for the levels in the resulting hierarchical index. -- ``verify_integrity``: boolean, default False. Check whether the new +- ``verify_integrity`` : boolean, default False. Check whether the new concatenated axis contains duplicates. 
This can be very expensive relative to the actual data concatenation. -- ``ignore_index`` : boolean, default False. If True, do not use the index - values on the concatenation axis. The resulting axis will be labeled 0, ..., - n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note - the index values on the other axes are still respected in the join. +- ``union_categoricals`` : boolean, default False. + If True, use union_categoricals rule to concat category dtype. + If False, category dtype is kept if both categories are identical, + otherwise results in object dtype. + See :ref:`Categoricals Concatenation` for detail. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. Without a little bit of context and example many of these arguments don't make diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2d93652ca91db..342aeb1f58e71 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -15,6 +15,8 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`here ` - pandas development api, see :ref:`here ` - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here ` - Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` @@ -277,6 +279,37 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) df['col3'] +.. 
_whatsnew_0190.enhancements.union_categoricals: + +Categorical Concatenation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`) + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +- ``concat`` and ``append`` now can concat unordered ``category`` dtypes using ``union_categoricals`` internally. (:issue:`13524`) + + By default, different categories result in ``object`` dtype. + + .. ipython:: python + + s1 = pd.Series(['a', 'b'], dtype='category') + s2 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s2]) + + Specifying ``union_categoricals=True`` concatenates the categories following the + ``union_categoricals`` rule. + + .. ipython:: python + + pd.concat([s1, s2], union_categoricals=True) + .. _whatsnew_0190.enhancements.semi_month_offsets: Semi-Month Offsets @@ -448,7 +481,6 @@ Other enhancements - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, :issue:`13846`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ac3e5d2aabef7..188365701fd3c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4322,7 +4322,8 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False): + def append(self, other, ignore_index=False, verify_integrity=False, + union_categoricals=False): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -4335,6 +4336,10 @@ def append(self, other, ignore_index=False, verify_integrity=False): If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. + union_categoricals : bool, default False + If True, use union_categoricals rule to concat category dtype. + If False, category dtype is kept if both categories are identical, + otherwise results in object dtype. 
Returns ------- @@ -4411,7 +4416,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + union_categoricals=union_categoricals) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb2d1a9d1b5d3..c31a92055ad71 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1144,7 +1144,7 @@ def get_result(other): return self._try_coerce_result(result) # error handler if we have an issue operating with the function - def handle_error(): + def handle_error(detail): if raise_on_error: raise TypeError('Could not operate %s with block values %s' % @@ -1165,7 +1165,7 @@ def handle_error(): except ValueError as detail: raise except Exception as detail: - result = handle_error() + result = handle_error(detail) # technically a broadcast error in numpy can 'work' by returning a # boolean False @@ -4771,7 +4771,8 @@ def _putmask_smart(v, m, n): return nv -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, + copy, union_categoricals=False): """ Concatenate block managers into one. @@ -4781,16 +4782,20 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): axes : list of Index concat_axis : int copy : bool + union_categoricals : bool, default False + If True, use union_categoricals rule to concat CategoricalBlock. + If False, CategoricalBlock is kept if both categories are + identical, otherwise results in ObjectBlock. 
""" concat_plan = combine_concat_plans( [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block(concatenate_join_units(join_units, concat_axis, - copy=copy), - placement=placement) - for placement, join_units in concat_plan] + blocks = [make_block( + concatenate_join_units(join_units, concat_axis, copy=copy, + union_categoricals=union_categoricals), + placement=placement) for placement, join_units in concat_plan] return BlockManager(blocks, axes) @@ -4875,7 +4880,8 @@ def get_empty_dtype_and_na(join_units): raise AssertionError("invalid dtype determination in get_concat_dtype") -def concatenate_join_units(join_units, concat_axis, copy): +def concatenate_join_units(join_units, concat_axis, copy, + union_categoricals=False): """ Concatenate values from several join units along selected axis. """ @@ -4895,7 +4901,8 @@ def concatenate_join_units(join_units, concat_axis, copy): if copy and concat_values.base is not None: concat_values = concat_values.copy() else: - concat_values = _concat._concat_compat(to_concat, axis=concat_axis) + concat_values = _concat._concat_compat( + to_concat, axis=concat_axis, union_categoricals=union_categoricals) return concat_values diff --git a/pandas/core/series.py b/pandas/core/series.py index 8379c8bcdcae8..f8df0d7ce6901 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1525,7 +1525,8 @@ def searchsorted(self, v, side='left', sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, ignore_index=False, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False, + union_categoricals=False): """ Concatenate two or more Series. 
@@ -1539,6 +1540,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates + union_categoricals : bool, default False + If True, use union_categoricals rule to concat category dtype. + If False, category dtype is kept if both categories are identical, + otherwise results in object dtype. Returns ------- @@ -1592,7 +1597,8 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): else: to_concat = [self, to_append] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + union_categoricals=union_categoricals) def _binop(self, other, func, level=None, fill_value=None): """ diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index fd6fd90cd631f..23261c2ef79e2 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -185,9 +185,9 @@ def test_concat_empty_series_dtypes(self): 'category') self.assertEqual(pd.concat([Series(dtype='category'), Series(dtype='float64')]).dtype, - np.object_) + 'float64') self.assertEqual(pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype, 'category') + Series(dtype='object')]).dtype, 'object') # sparse result = pd.concat([Series(dtype='float64').to_sparse(), Series( diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index eabd118de671d..a2babf852d993 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2088,8 +2088,8 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100), + dtype='int32')}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 
10000, 500)] df = df.sort_values(by=['value'], ascending=True) @@ -3355,16 +3355,15 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical( - ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]) + cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c"]) idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": Categorical( - ['b', 'b'], categories=['a', 'b', 'c']), + {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), "values": [2, 2]}, index=['j', 'k']) tm.assert_frame_equal(result, expected) @@ -3379,10 +3378,9 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.ix["h":"j", 0:1] - expected = DataFrame({'cats': Series( - Categorical( - ['a', 'b', 'b'], categories=['a', 'b', 'c']), index=['h', 'i', - 'j'])}) + expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], + categories=['a', 'b', 'c'])}, + index=['h', 'i', 'j']) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): @@ -3636,8 +3634,8 @@ def f(): with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical( - ["c", "c"], categories=["a", "b", "c"]) + df.ix["j":"k", 0] = pd.Categorical(["c", "c"], + categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3674,8 +3672,8 @@ def f(): self.assertRaises(ValueError, f) # fancy indexing - catsf = pd.Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]) + catsf = pd.Categorical(["a", "a", "c", "c", "a", "a", "a"], + categories=["a", "b", "c"]) idxf = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = pd.DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3733,9 +3731,8 @@ def f(): s = orig.copy() 
s.index = ["x", "y"] s["y"] = "a" - exp = Series( - pd.Categorical(["b", "a"], - categories=["a", "b"]), index=["x", "y"]) + exp = Series(pd.Categorical(["b", "a"], categories=["a", "b"]), + index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -3887,7 +3884,7 @@ def test_cat_equality(self): self.assertRaises(TypeError, lambda: a > b) self.assertRaises(TypeError, lambda: b > a) - def test_concat(self): + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] df = pd.DataFrame({"cats": cat, "vals": vals}) @@ -3896,20 +3893,32 @@ def test_concat(self): exp = pd.DataFrame({"cats": cat2, "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - res = pd.concat([df, df]) - tm.assert_frame_equal(exp, res) + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) - # Concat should raise if the two categoricals do not have the same - # categories + # GH 13524 can concat different categories using union_categoricals cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) + df_different_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - def f(): - pd.concat([df, df_wrong_categories]) + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = pd.DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) - self.assertRaises(ValueError, f) + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) + + res = pd.concat([df, df_different_categories], ignore_index=True, + union_categoricals=True) + exp_cat = pd.Categorical(list('abab'), categories=["a", "b", "c"]) + exp = pd.DataFrame({"cats": exp_cat, "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) + res = df.append(df_different_categories, ignore_index=True, + union_categoricals=True) + tm.assert_frame_equal(res, exp) + + def 
test_concat_append_gh7864(self): # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], @@ -3926,41 +3935,53 @@ def f(): df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.categories self.assert_index_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + dfa = df1.append(df2) + self.assert_index_equal(df['grade'].cat.categories, + dfa['grade'].cat.categories) + + def test_concat_preserve(self): - # GH 8641 - # series concat not preserving category dtype + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories using union_categoricals s = Series(list('abc'), dtype='category') s2 = Series(list('abd'), dtype='category') - def f(): - pd.concat([s, s2]) + exp = Series(list('abcabd')) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) - self.assertRaises(ValueError, f) + exp = Series(Categorical(list('abcabd'), categories=list('abcd'))) + res = pd.concat([s, s2], ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s], ignore_index=True) - expected = Series(list('abcabc')).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), dtype='category') + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s]) - expected = Series( - list('abcabc'), index=[0, 1, 2, 0, 1, 2]).astype('category') - tm.assert_series_equal(result, expected) + res = pd.concat([s, s], ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) + + exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], + dtype='category') + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) + result = pd.concat([s, s], union_categoricals=True) + tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype='int64')) b = Series(list('aabbca')) df2 = DataFrame({'A': a, 'B': b.astype('category', 
categories=list('cab'))}) - result = pd.concat([df2, df2]) - expected = DataFrame({'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype( - 'category', categories=list('cab'))}) - tm.assert_frame_equal(result, expected) + res = pd.concat([df2, df2]) + exp = DataFrame({'A': pd.concat([a, a]), + 'B': pd.concat([b, b]).astype( + 'category', categories=list('cab'))}) + tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): @@ -3968,44 +3989,21 @@ def test_categorical_index_preserver(self): b = Series(list('aabbca')) df2 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'cab'))}).set_index('B') + 'B': b.astype('category', categories=list('cab')) + }).set_index('B') result = pd.concat([df2, df2]) expected = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( - 'category', categories=list( - 'cab'))}).set_index('B') + 'category', categories=list('cab')) + }).set_index('B') tm.assert_frame_equal(result, expected) # wrong catgories df3 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'abc'))}).set_index('B') + 'B': pd.Categorical(b, categories=list('abc')) + }).set_index('B') self.assertRaises(TypeError, lambda: pd.concat([df2, df3])) - def test_append(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = pd.DataFrame({"cats": cat, "vals": vals}) - cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = pd.DataFrame({"cats": cat2, - "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - - res = df.append(df) - tm.assert_frame_equal(exp, res) - - # Concat should raise if the two categoricals do not have the same - # categories - cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - - def f(): - df.append(df_wrong_categories) - - self.assertRaises(ValueError, f) - def test_merge(self): # GH 9426 @@ -4470,28 +4468,26 @@ def 
test_dt_accessor_api_for_categorical(self): def test_concat_categorical(self): # See GH 10177 - df1 = pd.DataFrame( - np.arange(18, dtype='int64').reshape(6, - 3), columns=["a", "b", "c"]) - - df2 = pd.DataFrame( - np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"]) - df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two", - "two", "one"])) - - df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True) - - df_expected = pd.DataFrame( - {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]}) - df_expected['h'] = pd.Series(pd.Categorical( - [None, None, None, None, None, None, "one", "one", "two", "one", - "two", "two", "one"])) - - tm.assert_frame_equal(df_expected, df_concat) - + df1 = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3), + columns=["a", "b", "c"]) + + df2 = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2), + columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2['h'] = pd.Series(pd.Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True) + exp = pd.DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], + 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + 'h': [None] * 6 + cat_values}) + tm.assert_frame_equal(res, exp) + + res = pd.concat((df1, df2), axis=0, ignore_index=True, + union_categoricals=True) + tm.assert_frame_equal(res, exp) class TestCategoricalSubclassing(tm.TestCase): diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 7a29918c55658..cfbf465e9a5a7 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1269,7 +1269,7 @@ def _get_join_keys(llab, rlab, shape, sort): def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, 
verify_integrity=False, - copy=True): + union_categoricals=False, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. Can also add a layer of hierarchical indexing on the @@ -1290,9 +1290,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, join_axes : list of Index objects Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic - verify_integrity : boolean, default False - Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation + ignore_index : boolean, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. keys : sequence, default None If multiple levels passed, should contain tuples. Construct hierarchical index using the passed keys as the outermost level @@ -1301,12 +1304,13 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, MultiIndex. Otherwise they will be inferred from the keys names : list, default None Names for the levels in the resulting hierarchical index - ignore_index : boolean, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. 
This can + be very expensive relative to the actual data concatenation + union_categoricals : boolean, default False + If True, use ``union_categoricals`` to concat category dtype. + If False, category dtype is kept if both categories are identical, + otherwise results in object dtype. copy : boolean, default True If False, do not copy data unnecessarily @@ -1322,6 +1326,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, + union_categoricals=union_categoricals, copy=copy) return op.get_result() @@ -1333,7 +1338,8 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False, + union_categoricals=False, copy=True): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -1459,6 +1465,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.ignore_index = ignore_index self.verify_integrity = verify_integrity + self.union_categoricals = union_categoricals self.copy = copy self.new_axes = self._get_new_axes() @@ -1476,7 +1483,8 @@ def get_result(self): values = [x._values for x in non_empties] else: values = [x._values for x in self.objs] - new_data = _concat._concat_compat(values) + new_data = _concat._concat_compat( + values, union_categoricals=self.union_categoricals) name = com._consensus_name_attr(self.objs) cons = _concat._get_series_result_type(new_data) @@ -1512,10 +1520,9 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) - new_data = concatenate_block_managers(mgrs_indexers, - self.new_axes, - concat_axis=self.axis, - copy=self.copy) + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, 
concat_axis=self.axis, + union_categoricals=self.union_categoricals, copy=self.copy) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 102f21bcdc535..7d293cf19ab0f 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -450,6 +450,377 @@ def test_concatlike_common_period_mixed_dt_to_object(self): res = pd.concat([tds, ps1]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category') + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1], dtype='category') + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories with union_categoricals => category + exp = pd.Series(pd.Categorical([3, 2, 2, 1], categories=[2, 3, 1])) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + # completelly different categories (same dtype) => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') + + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completelly different categories (same dtype) with union_categoricals + # => category + exp = 
pd.Series(pd.Categorical([10, 11, np.nan, np.nan, 1, 3, 2], + categories=[10, 11, 1, 2, 3])) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = pd.Series([2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True, + union_categoricals=True), exp) + + # all values are not in category => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1]) + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completelly different categories => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([1, 3, 2]) + + exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + 
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completelly different categories with union_categoricals + # => not-category + exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True, + union_categoricals=True), exp) + + # different dtype => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series(['a', 'b', 'c']) + + exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c']) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = pd.Series([10, 11], dtype='category') + s2 = pd.Series([np.nan, np.nan, np.nan]) + + exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, 
ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True, + union_categoricals=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + s3 = pd.Series([1, 2, 1, 2, np.nan]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True, + union_categoricals=True), exp) + + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True, + union_categoricals=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([1, 3, 4]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True, + union_categoricals=True), exp) + + exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], 
ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True, + union_categoricals=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([10, 11, 12]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = pd.Series([1, 3], dtype='category') + s2 = pd.Series([3, 4], dtype='category') + s3 = pd.Series([2, 3]) + s4 = pd.Series([2, 2], dtype='category') + s5 = pd.Series([1, np.nan]) + s6 = pd.Series([1, 3, 2], dtype='category') + + # mixed dtype, values are all in categories => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + # mixed dtypes => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True, + union_categoricals=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True, + union_categoricals=True) + 
tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True, + union_categoricals=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True, + union_categoricals=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + res = pd.concat([s1, s2], ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) + res = s1.append(s2, ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], + ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + res = pd.concat([s1, s2, s1], ignore_index=True, + union_categoricals=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s1], ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) + + s3 = pd.Series(pd.Categorical([2, 1, 2], categories=[2, 1], + ordered=True)) + + res = pd.concat([s1, s3], ignore_index=True, union_categoricals=True) + exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(res, exp) + + res = s1.append(s3, ignore_index=True, union_categoricals=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), + dtype='category') + s2 = pd.Series([np.nan, 1]) + + exp = 
pd.Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = pd.Series([1, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([1, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, + union_categoricals=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True, + union_categoricals=True), exp) + + # all category nan-likes => category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan], dtype='category') + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([1, 2], dtype='category') + + 
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([]) + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, + union_categoricals=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True, + union_categoricals=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, + union_categoricals=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True, + union_categoricals=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + # empty Series is ignored + exp = pd.Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + class TestAppend(ConcatenateBase): diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 29a0fe7d9f8d0..81b70ce7c63bf 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -5,7 +5,6 @@ import numpy as np import pandas.tslib as tslib from pandas import compat -from pandas.compat import map from pandas.core.algorithms import take_1d from 
.common import (is_categorical_dtype, is_sparse, @@ -97,7 +96,7 @@ def _get_frame_result_type(result, objs): return objs[0] -def _concat_compat(to_concat, axis=0): +def _concat_compat(to_concat, axis=0, union_categoricals=False): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -133,19 +132,22 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) - # these are mandated to handle empties as well _contains_datetime = any(typ.startswith('datetime') for typ in typs) _contains_period = any(typ.startswith('period') for typ in typs) - if _contains_datetime or 'timedelta' in typs or _contains_period: + if 'category' in typs: + # this must be priort to _concat_datetime, + # to support Categorical + datetime-like + return _concat_categorical(to_concat, axis=axis, + _union_categoricals=union_categoricals) + + elif _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) + # these are mandated to handle empties as well elif 'sparse' in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) - elif 'category' in typs: - return _concat_categorical(to_concat, axis=axis) - if not nonempty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise @@ -164,7 +166,7 @@ def is_nonempty(x): return np.concatenate(to_concat, axis=axis) -def _concat_categorical(to_concat, axis=0): +def _concat_categorical(to_concat, axis=0, _union_categoricals=False): """Concatenate an object/categorical array of arrays, each of which is a single dtype @@ -181,18 +183,14 @@ def _concat_categorical(to_concat, axis=0): A single array, preserving the combined dtypes """ - from pandas.core.categorical import Categorical - - def convert_categorical(x): - # coerce to object dtype - if is_categorical_dtype(x.dtype): - return x.get_values() - return x.ravel() - - if 
get_dtype_kinds(to_concat) - set(['object', 'category']): - # convert to object type and perform a regular concat - return _concat_compat([np.array(x, copy=False, dtype=object) - for x in to_concat], axis=0) + def _concat_asobject(to_concat): + to_concat = [x.get_values() if is_categorical_dtype(x.dtype) + else x.ravel() for x in to_concat] + res = _concat_compat(to_concat) + if axis == 1: + return res.reshape(1, len(res)) + else: + return res # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything @@ -200,25 +198,21 @@ def convert_categorical(x): categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] # validate the categories - categories = categoricals[0] - rawcats = categories.categories - for x in categoricals[1:]: - if not categories.is_dtype_equal(x): - raise ValueError("incompatible categories in categorical concat") - - # we've already checked that all categoricals are the same, so if their - # length is equal to the input then we have all the same categories - if len(categoricals) == len(to_concat): - # concating numeric types is much faster than concating object types - # and fastpath takes a shorter path through the constructor - return Categorical(np.concatenate([x.codes for x in to_concat], - axis=0), - rawcats, ordered=categoricals[0].ordered, - fastpath=True) + if len(categoricals) != len(to_concat): + pass + elif _union_categoricals: + try: + # this trial may fail in ordered categories + return union_categoricals(categoricals) + except TypeError: + pass else: - concatted = np.concatenate(list(map(convert_categorical, to_concat)), - axis=0) - return Categorical(concatted, rawcats) + # when all categories are identical + first = to_concat[0] + if all(first.is_dtype_equal(other) for other in to_concat[1:]): + return union_categoricals(categoricals) + + return _concat_asobject(to_concat) def union_categoricals(to_union, sort_categories=False): From 
96a372e09005c29eba935b9d1778537cbfae50c9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 7 Sep 2016 07:52:36 +0900 Subject: [PATCH 2/2] remove union_categoricals kw from concat --- doc/source/categorical.rst | 29 +++---- doc/source/merging.rst | 7 +- doc/source/whatsnew/v0.19.0.txt | 49 ++++++------ pandas/core/frame.py | 10 +-- pandas/core/internals.py | 20 ++--- pandas/core/series.py | 10 +-- pandas/tests/test_categorical.py | 26 +------ pandas/tools/merge.py | 16 +--- pandas/tools/tests/test_concat.py | 123 ------------------------------ pandas/types/concat.py | 13 +--- 10 files changed, 61 insertions(+), 242 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index ccbda68f3a822..59ddfe602c033 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -702,30 +702,33 @@ Concatenation This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for general description. -By default, ``Series`` or ``DataFrame`` concatenation which contains different -categories results in ``object`` dtype. +By default, ``Series`` or ``DataFrame`` concatenation which contains the same categories +results in ``category`` dtype, otherwise results in ``object`` dtype. +Use ``.astype`` or ``union_categoricals`` to get ``category`` result. .. ipython:: python + # same categories s1 = pd.Series(['a', 'b'], dtype='category') - s2 = pd.Series(['b', 'c'], dtype='category') + s2 = pd.Series(['a', 'b', 'a'], dtype='category') pd.concat([s1, s2]) -Specifying ``union_categoricals=True`` allows to concat categories following -``union_categoricals`` rule. + # different categories + s3 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s3]) -.. ipython:: python + pd.concat([s1, s3]).astype('category') + union_categoricals([s1.values, s3.values]) - pd.concat([s1, s2], union_categoricals=True) Following table summarizes the results of ``Categoricals`` related concatenations. 
-| arg1 | arg2 | default | ``union_categoricals=True`` | -|---------|-------------------------------------------|---------|------------------------------| -| category | category (identical categories) | category | category | -| category | category (different categories, both not ordered) | object (dtype is inferred) | category | -| category | category (different categories, either one is ordered) | object (dtype is inferred) | object (dtype is inferred) | -| category | not category | object (dtype is inferred) | object (dtype is inferred) +| arg1 | arg2 | result | +|---------|-------------------------------------------|---------| +| category | category (identical categories) | category | +| category | category (different categories, both not ordered) | object (dtype is inferred) | +| category | category (different categories, either one is ordered) | object (dtype is inferred) | +| category | not category | object (dtype is inferred) | Getting Data In/Out ------------------- diff --git a/doc/source/merging.rst b/doc/source/merging.rst index f56188dfb10f9..c6541a26c72b4 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -79,7 +79,7 @@ some configurable handling of "what to do with the other axes": pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - union_categoricals=False, copy=True) + copy=True) - ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless @@ -107,11 +107,6 @@ some configurable handling of "what to do with the other axes": - ``verify_integrity`` : boolean, default False. Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. -- ``union_categoricals`` : boolean, default False. - If True, use union_categoricals rule to concat category dtype. 
- If False, category dtype is kept if both categories are identical, - otherwise results in object dtype. - See :ref:`Categoricals Concatenation` for detail. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. Without a little bit of context and example many of these arguments don't make diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 342aeb1f58e71..725c9561cc0b8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -220,7 +220,7 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is data = '0,1,2\n3,4,5' names = ['a', 'b', 'a'] -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -233,7 +233,7 @@ Previous behaviour: The first ``a`` column contains the same data as the second ``a`` column, when it should have contained the values ``[0, 3]``. -New behaviour: +New Behavior: .. ipython :: python @@ -293,22 +293,23 @@ Categorical Concatenation b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) -- ``concat`` and ``append`` now can concat unordered ``category`` dtypes using ``union_categorical`` internally. (:issue:`13524`) +- ``concat`` and ``append`` now can concat ``category`` dtypes with different +``categories`` as ``object`` dtype (:issue:`13524`) - By default, different categories results in ``object`` dtype. +Previous Behavior: - .. ipython:: python + .. code-block:: ipython - s1 = pd.Series(['a', 'b'], dtype='category') - s2 = pd.Series(['b', 'c'], dtype='category') - pd.concat([s1, s2]) + In [1]: s1 = pd.Series(['a', 'b'], dtype='category') + In [2]: s2 = pd.Series(['b', 'c'], dtype='category') + In [3]: pd.concat([s1, s2]) + ValueError: incompatible categories in categorical concat - Specifying ``union_categoricals=True`` allows to concat categories following - ``union_categoricals`` rule. +New Behavior: .. ipython:: python - pd.concat([s1, s2], union_categoricals=True) + pd.concat([s1, s2]) .. 
_whatsnew_0190.enhancements.semi_month_offsets: @@ -411,11 +412,11 @@ get_dummies dtypes The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`). This should provide an improved memory footprint. -Previous behaviour: +Previous Behavior: .. code-block:: ipython - In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes Out[1]: a float64 @@ -437,7 +438,7 @@ Other enhancements - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) .. ipython:: python @@ -544,7 +545,7 @@ API changes ``Series.tolist()`` will now return Python types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) +``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behavior (:issue:`10904`) .. ipython:: python @@ -579,7 +580,7 @@ including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) .. warning:: Until 0.18.1, comparing ``Series`` with the same length, would succeed even if - the ``.index`` are different (the result ignores ``.index``). 
As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes, using the flexible comparison methods like ``.eq``. + the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behavior or align different indexes, using the flexible comparison methods like ``.eq``. As a result, ``Series`` and ``DataFrame`` operators behave as below: @@ -647,7 +648,7 @@ Logical operators Logical operators align both ``.index``. -Previous Behavior (``Series``), only left hand side ``index`` is kept: +Previous behavior (``Series``), only left hand side ``index`` is kept: .. code-block:: ipython @@ -966,7 +967,7 @@ Index ``+`` / ``-`` no longer used for set operations Addition and subtraction of the base Index type (not the numeric subclasses) previously performed set operations (set union and difference). This -behaviour was already deprecated since 0.15.0 (in favor using the specific +behavior was already deprecated since 0.15.0 (in favor using the specific ``.union()`` and ``.difference()`` methods), and is now disabled. When possible, ``+`` and ``-`` are now used for element-wise operations, for example for concatenating strings (:issue:`8227`, :issue:`14127`). @@ -986,13 +987,13 @@ The same operation will now perform element-wise addition: pd.Index(['a', 'b']) + pd.Index(['a', 'c']) Note that numeric Index objects already performed element-wise operations. -For example, the behaviour of adding two integer Indexes: +For example, the behavior of adding two integer Indexes: .. ipython:: python pd.Index([1, 2, 3]) + pd.Index([2, 3, 4]) -is unchanged. The base ``Index`` is now made consistent with this behaviour. +is unchanged. The base ``Index`` is now made consistent with this behavior. .. 
_whatsnew_0190.api.difference: @@ -1143,7 +1144,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument. data = 'A,B\n0,1\n2,3\n4,5\n6,7' -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -1155,7 +1156,7 @@ Previous behaviour: 0 4 5 1 6 7 -New behaviour: +New Behavior: .. ipython :: python @@ -1281,7 +1282,7 @@ These types are the same on many platform, but for 64 bit python on Windows, ``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many operations on that platform. -Previous behaviour: +Previous Behavior: .. code-block:: ipython @@ -1290,7 +1291,7 @@ Previous behaviour: In [2]: i.get_indexer(['b', 'b', 'c']).dtype Out[2]: dtype('int32') -New behaviour: +New Behavior: .. code-block:: ipython diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 188365701fd3c..ac3e5d2aabef7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4322,8 +4322,7 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False, - union_categoricals=False): + def append(self, other, ignore_index=False, verify_integrity=False): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. @@ -4336,10 +4335,6 @@ def append(self, other, ignore_index=False, verify_integrity=False, If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. - union_categoricals : bool, default False - If True, use union_categoricals rule to concat category dtype. - If False, category dtype is kept if both categories are identical, - otherwise results in object dtype. 
Returns ------- @@ -4416,8 +4411,7 @@ def append(self, other, ignore_index=False, verify_integrity=False, else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity, - union_categoricals=union_categoricals) + verify_integrity=verify_integrity) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c31a92055ad71..9a1c7864903d7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1144,7 +1144,7 @@ def get_result(other): return self._try_coerce_result(result) # error handler if we have an issue operating with the function - def handle_error(detail): + def handle_error(): if raise_on_error: raise TypeError('Could not operate %s with block values %s' % @@ -1165,7 +1165,7 @@ def handle_error(detail): except ValueError as detail: raise except Exception as detail: - result = handle_error(detail) + result = handle_error() # technically a broadcast error in numpy can 'work' by returning a # boolean False @@ -4771,8 +4771,7 @@ def _putmask_smart(v, m, n): return nv -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, - copy, union_categoricals=False): +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): """ Concatenate block managers into one. @@ -4782,10 +4781,6 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, axes : list of Index concat_axis : int copy : bool - union_categoricals : bool, default False - If True, use union_categoricals rule to concat CategoricalBlock. - If False, CategoricalBlock is kept if both categories are - identical, otherwise results in ObjectBlock. 
""" concat_plan = combine_concat_plans( @@ -4793,8 +4788,7 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, for mgr, indexers in mgrs_indexers], concat_axis) blocks = [make_block( - concatenate_join_units(join_units, concat_axis, copy=copy, - union_categoricals=union_categoricals), + concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement) for placement, join_units in concat_plan] return BlockManager(blocks, axes) @@ -4880,8 +4874,7 @@ def get_empty_dtype_and_na(join_units): raise AssertionError("invalid dtype determination in get_concat_dtype") -def concatenate_join_units(join_units, concat_axis, copy, - union_categoricals=False): +def concatenate_join_units(join_units, concat_axis, copy): """ Concatenate values from several join units along selected axis. """ @@ -4901,8 +4894,7 @@ def concatenate_join_units(join_units, concat_axis, copy, if copy and concat_values.base is not None: concat_values = concat_values.copy() else: - concat_values = _concat._concat_compat( - to_concat, axis=concat_axis, union_categoricals=union_categoricals) + concat_values = _concat._concat_compat(to_concat, axis=concat_axis) return concat_values diff --git a/pandas/core/series.py b/pandas/core/series.py index f8df0d7ce6901..8379c8bcdcae8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1525,8 +1525,7 @@ def searchsorted(self, v, side='left', sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, ignore_index=False, verify_integrity=False, - union_categoricals=False): + def append(self, to_append, ignore_index=False, verify_integrity=False): """ Concatenate two or more Series. 
@@ -1540,10 +1539,6 @@ def append(self, to_append, ignore_index=False, verify_integrity=False, verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates - union_categoricals : bool, default False - If True, use union_categoricals rule to concat category dtype. - If False, category dtype is kept if both categories are identical, - otherwise results in object dtype. Returns ------- @@ -1597,8 +1592,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False, else: to_concat = [self, to_append] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity, - union_categoricals=union_categoricals) + verify_integrity=verify_integrity) def _binop(self, other, func, level=None, fill_value=None): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a2babf852d993..c4ddd2c0981d9 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3896,7 +3896,7 @@ def test_concat_append(self): tm.assert_frame_equal(pd.concat([df, df]), exp) tm.assert_frame_equal(df.append(df), exp) - # GH 13524 can concat different categories using union_categoricals + # GH 13524 can concat different categories cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] df_different_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) @@ -3908,16 +3908,6 @@ def test_concat_append(self): res = df.append(df_different_categories, ignore_index=True) tm.assert_frame_equal(res, exp) - res = pd.concat([df, df_different_categories], ignore_index=True, - union_categoricals=True) - exp_cat = pd.Categorical(list('abab'), categories=["a", "b", "c"]) - exp = pd.DataFrame({"cats": exp_cat, "vals": [1, 2, 1, 2]}) - tm.assert_frame_equal(res, exp) - - res = df.append(df_different_categories, ignore_index=True, - union_categoricals=True) - tm.assert_frame_equal(res, exp) - def test_concat_append_gh7864(self): # GH 7864 # make sure ordering is 
preserverd @@ -3946,7 +3936,7 @@ def test_concat_append_gh7864(self): def test_concat_preserve(self): # GH 8641 series concat not preserving category dtype - # GH 13524 can concat different categories using union_categoricals + # GH 13524 can concat different categories s = Series(list('abc'), dtype='category') s2 = Series(list('abd'), dtype='category') @@ -3954,23 +3944,14 @@ def test_concat_preserve(self): res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) - exp = Series(Categorical(list('abcabd'), categories=list('abcd'))) - res = pd.concat([s, s2], ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), dtype='category') res = pd.concat([s, s], ignore_index=True) tm.assert_series_equal(res, exp) - res = pd.concat([s, s], ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], dtype='category') res = pd.concat([s, s]) tm.assert_series_equal(res, exp) - result = pd.concat([s, s], union_categoricals=True) - tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype='int64')) b = Series(list('aabbca')) @@ -4485,9 +4466,6 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) - res = pd.concat((df1, df2), axis=0, ignore_index=True, - union_categoricals=True) - tm.assert_frame_equal(res, exp) class TestCategoricalSubclassing(tm.TestCase): diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index cfbf465e9a5a7..6521acbd0b733 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1269,7 +1269,7 @@ def _get_join_keys(llab, rlab, shape, sort): def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - union_categoricals=False, copy=True): + copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. 
Can also add a layer of hierarchical indexing on the @@ -1307,10 +1307,6 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, verify_integrity : boolean, default False Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation - union_categoricals : boolean, default False - If True, use ``union_categoricals`` to concat category dtype. - If False, category dtype is kept if both categories are identical, - otherwise results in object dtype. copy : boolean, default True If False, do not copy data unnecessarily @@ -1326,7 +1322,6 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, - union_categoricals=union_categoricals, copy=copy) return op.get_result() @@ -1338,8 +1333,7 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, - union_categoricals=False, copy=True): + ignore_index=False, verify_integrity=False, copy=True): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -1465,7 +1459,6 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.ignore_index = ignore_index self.verify_integrity = verify_integrity - self.union_categoricals = union_categoricals self.copy = copy self.new_axes = self._get_new_axes() @@ -1483,8 +1476,7 @@ def get_result(self): values = [x._values for x in non_empties] else: values = [x._values for x in self.objs] - new_data = _concat._concat_compat( - values, union_categoricals=self.union_categoricals) + new_data = _concat._concat_compat(values) name = com._consensus_name_attr(self.objs) cons = _concat._get_series_result_type(new_data) @@ -1522,7 +1514,7 @@ def 
get_result(self): new_data = concatenate_block_managers( mgrs_indexers, self.new_axes, concat_axis=self.axis, - union_categoricals=self.union_categoricals, copy=self.copy) + copy=self.copy) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 7d293cf19ab0f..8e20cfa83c405 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -469,13 +469,6 @@ def test_concat_categorical(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - # partially different categories with union_categoricals => category - exp = pd.Series(pd.Categorical([3, 2, 2, 1], categories=[2, 3, 1])) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - # completelly different categories (same dtype) => not-category s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') @@ -484,15 +477,6 @@ def test_concat_categorical(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - # completelly different categories (same dtype) with union_categoricals - # => category - exp = pd.Series(pd.Categorical([10, 11, np.nan, np.nan, 1, 3, 2], - categories=[10, 11, 1, 2, 3])) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - def test_concat_categorical_coercion(self): # GH 13524 @@ -504,21 +488,11 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2], 
ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - # result shouldn't be affected by 1st elem dtype exp = pd.Series([2, 1, 2, 1, 2, np.nan]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True, - union_categoricals=True), exp) - # all values are not in category => not-category s1 = pd.Series([3, 2], dtype='category') s2 = pd.Series([2, 1]) @@ -543,20 +517,6 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - # completelly different categories with union_categoricals - # => not-category - exp = pd.Series([10, 11, np.nan, 1, 3, 2]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - - exp = pd.Series([1, 3, 2, 10, 11, np.nan]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True, - union_categoricals=True), exp) - # different dtype => not-category s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series(['a', 'b', 'c']) @@ -576,18 +536,10 @@ def test_concat_categorical_coercion(self): exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) exp = pd.Series([np.nan, np.nan, 
np.nan, 10, 11]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True, - union_categoricals=True), exp) def test_concat_categorical_3elem_coercion(self): # GH 13524 @@ -600,18 +552,10 @@ def test_concat_categorical_3elem_coercion(self): exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True, - union_categoricals=True), exp) exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True, - union_categoricals=True), exp) # values are all in either category => not-category s1 = pd.Series([4, 5, 6], dtype='category') @@ -621,18 +565,10 @@ def test_concat_categorical_3elem_coercion(self): exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True, - union_categoricals=True), exp) exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), 
exp) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True, - union_categoricals=True), exp) # values are all in either category => not-category s1 = pd.Series([4, 5, 6], dtype='category') @@ -670,23 +606,6 @@ def test_concat_categorical_multi_coercion(self): res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) - # mixed dtypes => not-category - exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) - res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True, - union_categoricals=True) - tm.assert_series_equal(res, exp) - res = s1.append([s2, s3, s4, s5, s6], ignore_index=True, - union_categoricals=True) - tm.assert_series_equal(res, exp) - - exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) - res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True, - union_categoricals=True) - tm.assert_series_equal(res, exp) - res = s6.append([s5, s4, s3, s2, s1], ignore_index=True, - union_categoricals=True) - tm.assert_series_equal(res, exp) - def test_concat_categorical_ordered(self): # GH 13524 @@ -696,30 +615,11 @@ def test_concat_categorical_ordered(self): exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - res = pd.concat([s1, s2], ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) - res = s1.append(s2, ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True)) tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) - res = pd.concat([s1, s2, s1], ignore_index=True, - union_categoricals=True) - tm.assert_series_equal(res, exp) - res = 
s1.append([s2, s1], ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) - - s3 = pd.Series(pd.Categorical([2, 1, 2], categories=[2, 1], - ordered=True)) - - res = pd.concat([s1, s3], ignore_index=True, union_categoricals=True) - exp = pd.Series([1, 2, np.nan, 2, 1, 2]) - tm.assert_series_equal(res, exp) - - res = s1.append(s3, ignore_index=True, union_categoricals=True) - tm.assert_series_equal(res, exp) def test_concat_categorical_coercion_nan(self): # GH 13524 @@ -741,11 +641,6 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - # mixed dtype, all nan-likes => not-category s1 = pd.Series([np.nan, np.nan], dtype='category') s2 = pd.Series([np.nan, np.nan]) @@ -756,15 +651,6 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, - union_categoricals=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True, - union_categoricals=True), exp) - # all category nan-likes => category s1 = pd.Series([np.nan, np.nan], dtype='category') s2 = pd.Series([np.nan, np.nan], dtype='category') @@ -801,15 +687,6 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True, - 
union_categoricals=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True, - union_categoricals=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True, - union_categoricals=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True, - union_categoricals=True), s2) - s1 = pd.Series([], dtype='category') s2 = pd.Series([np.nan, np.nan]) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 81b70ce7c63bf..8bdd71348a537 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -96,7 +96,7 @@ def _get_frame_result_type(result, objs): return objs[0] -def _concat_compat(to_concat, axis=0, union_categoricals=False): +def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -138,8 +138,7 @@ def is_nonempty(x): if 'category' in typs: # this must be priort to _concat_datetime, # to support Categorical + datetime-like - return _concat_categorical(to_concat, axis=axis, - _union_categoricals=union_categoricals) + return _concat_categorical(to_concat, axis=axis) elif _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) @@ -166,7 +165,7 @@ def is_nonempty(x): return np.concatenate(to_concat, axis=axis) -def _concat_categorical(to_concat, axis=0, _union_categoricals=False): +def _concat_categorical(to_concat, axis=0): """Concatenate an object/categorical array of arrays, each of which is a single dtype @@ -200,12 +199,6 @@ def _concat_asobject(to_concat): # validate the categories if len(categoricals) != len(to_concat): pass - elif _union_categoricals: - try: - # this trial may fail in ordered categories - return union_categoricals(categoricals) - except TypeError: - pass else: # when all categories are identical first = to_concat[0]