diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index d59ad68c9ea83..59ddfe602c033 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -675,12 +675,65 @@ be lexsorted, use ``sort_categories=True`` argument.

    union_categoricals([a, b], sort_categories=True)

-.. note::
+``union_categoricals`` also works with the "easy" case of combining two
+categoricals that have the same categories and order information
+(i.e. cases where you could also use ``append``).
+
+.. ipython:: python
+
+    a = pd.Categorical(["a", "b"], ordered=True)
+    b = pd.Categorical(["a", "b", "a"], ordered=True)
+    union_categoricals([a, b])
+
+The following raises ``TypeError`` because the categories are ordered and not identical.
+
+.. code-block:: ipython
+
+    In [1]: a = pd.Categorical(["a", "b"], ordered=True)
+    In [2]: b = pd.Categorical(["a", "b", "c"], ordered=True)
+    In [3]: union_categoricals([a, b])
+    Out[3]:
+    TypeError: to union ordered Categoricals, all categories must be the same
+
+.. _categorical.concat:
+
+Concatenation
+~~~~~~~~~~~~~
+
+This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for a general description.
+
+By default, concatenating ``Series`` or ``DataFrame`` objects which contain the same
+categories results in ``category`` dtype; otherwise the result is ``object`` dtype.
+Use ``.astype`` or ``union_categoricals`` to get a ``category`` result.
+
+.. ipython:: python
+
+    # same categories
+    s1 = pd.Series(['a', 'b'], dtype='category')
+    s2 = pd.Series(['a', 'b', 'a'], dtype='category')
+    pd.concat([s1, s2])
+
+    # different categories
+    s3 = pd.Series(['b', 'c'], dtype='category')
+    pd.concat([s1, s3])
+
+    pd.concat([s1, s3]).astype('category')
+    union_categoricals([s1.values, s3.values])
+
+
+The following table summarizes the results of ``Categorical``-related concatenations.

-   In addition to the "easy" case of combining two categoricals of the same
-   categories and order information (e.g. what you could also ``append`` for),
-   ``union_categoricals`` only works with unordered categoricals and will
-   raise if any are ordered.
++----------+---------------------------------------------------------+----------------------------+
+| arg1     | arg2                                                    | result                     |
++==========+=========================================================+============================+
+| category | category (identical categories)                         | category                   |
++----------+---------------------------------------------------------+----------------------------+
+| category | category (different categories, both not ordered)      | object (dtype is inferred) |
++----------+---------------------------------------------------------+----------------------------+
+| category | category (different categories, either one is ordered) | object (dtype is inferred) |
++----------+---------------------------------------------------------+----------------------------+
+| category | not category                                            | object (dtype is inferred) |
++----------+---------------------------------------------------------+----------------------------+

 Getting Data In/Out
 -------------------
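The dtype rules documented above, condensed into one runnable check (a sketch against this branch; ``union_categoricals`` is assumed importable from ``pandas.types.concat``, as the docs show):

    import pandas as pd
    from pandas.types.concat import union_categoricals

    # identical categories: concat keeps the category dtype
    s1 = pd.Series(['a', 'b'], dtype='category')
    s2 = pd.Series(['a', 'b', 'a'], dtype='category')
    assert pd.concat([s1, s2]).dtype == 'category'

    # different categories: the result falls back to object dtype
    s3 = pd.Series(['b', 'c'], dtype='category')
    assert pd.concat([s1, s3]).dtype == object

    # ...and a categorical result can be recovered after the fact
    assert pd.concat([s1, s3]).astype('category').dtype == 'category'
    # union_categoricals combines the categories in order of appearance
    assert list(union_categoricals([s1.values, s3.values]).categories) == ['a', 'b', 'c']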
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index f14e5741c6e2e..c6541a26c72b4 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -78,34 +78,35 @@ some configurable handling of "what to do with the other axes":

 ::

     pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
-              keys=None, levels=None, names=None, verify_integrity=False)
+              keys=None, levels=None, names=None, verify_integrity=False,
+              copy=True)

-- ``objs``: a sequence or mapping of Series, DataFrame, or Panel objects. If a
+- ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a
  dict is passed, the sorted keys will be used as the `keys` argument, unless
  it is passed, in which case the values will be selected (see below). Any None
  objects will be dropped silently unless they are all None in which case a
  ValueError will be raised.
-- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along.
-- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on
+- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along.
+- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on
  other axis(es). Outer for union and inner for intersection.
-- ``join_axes``: list of Index objects. Specific indexes to use for the other
+- ``ignore_index`` : boolean, default False. If True, do not use the index
+  values on the concatenation axis. The resulting axis will be labeled 0, ...,
+  n - 1. This is useful if you are concatenating objects where the
+  concatenation axis does not have meaningful indexing information. Note
+  the index values on the other axes are still respected in the join.
+- ``join_axes`` : list of Index objects. Specific indexes to use for the other
  n - 1 axes instead of performing inner/outer set logic.
-- ``keys``: sequence, default None. Construct hierarchical index using the
+- ``keys`` : sequence, default None. Construct hierarchical index using the
  passed keys as the outermost level. If multiple levels passed, should
  contain tuples.
 - ``levels`` : list of sequences, default None. Specific levels (unique values)
  to use for constructing a MultiIndex. Otherwise they will be inferred from
  the keys.
-- ``names``: list, default None. Names for the levels in the resulting
+- ``names`` : list, default None. Names for the levels in the resulting
  hierarchical index.
-- ``verify_integrity``: boolean, default False. Check whether the new
+- ``verify_integrity`` : boolean, default False. Check whether the new
  concatenated axis contains duplicates. This can be very expensive relative
  to the actual data concatenation.
-- ``ignore_index`` : boolean, default False. If True, do not use the index
-  values on the concatenation axis. The resulting axis will be labeled 0, ...,
-  n - 1. This is useful if you are concatenating objects where the
-  concatenation axis does not have meaningful indexing information. Note
-  the index values on the other axes are still respected in the join.
 - ``copy`` : boolean, default True. If False, do not copy data unnecessarily.

Without a little bit of context and example many of these arguments don't make
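To make the ``ignore_index`` description above concrete, a minimal sketch (the frames and labels are made up for illustration):

    import pandas as pd

    df1 = pd.DataFrame({'x': [1, 2]}, index=['a', 'b'])
    df2 = pd.DataFrame({'x': [3, 4]}, index=['a', 'b'])

    # labels on the concatenation axis are discarded and replaced with 0..n-1
    res = pd.concat([df1, df2], ignore_index=True)
    assert list(res.index) == [0, 1, 2, 3]

    # verify_integrity=True would instead reject the duplicated labels
    try:
        pd.concat([df1, df2], verify_integrity=True)
    except ValueError:
        pass  # the overlapping index values ['a', 'b'] are reported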
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 2d93652ca91db..725c9561cc0b8 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -15,6 +15,8 @@ Highlights include:

 - :func:`merge_asof` for asof-style time-series joining, see :ref:`here `
 - ``.rolling()`` are now time-series aware, see :ref:`here `
+- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here `
+- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`here `
 - pandas development api, see :ref:`here `
 - ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here `
 - Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here `
@@ -218,7 +220,7 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is

    data = '0,1,2\n3,4,5'
    names = ['a', 'b', 'a']

-Previous behaviour:
+Previous Behavior:

 .. code-block:: ipython

@@ -231,7 +233,7 @@ Previous behaviour:

 The first ``a`` column contains the same data as the second ``a`` column, when
 it should have contained the values ``[0, 3]``.

-New behaviour:
+New Behavior:

 .. ipython :: python
@@ -277,6 +279,38 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

     df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
     df['col3']

+.. _whatsnew_0190.enhancements.union_categoricals:
+
+Categorical Concatenation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`)
+
+.. ipython:: python
+
+    from pandas.types.concat import union_categoricals
+    a = pd.Categorical(["b", "c"])
+    b = pd.Categorical(["a", "b"])
+    union_categoricals([a, b])
+
+- ``concat`` and ``append`` can now concatenate ``category`` dtypes with different
+  ``categories`` as ``object`` dtype (:issue:`13524`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: s1 = pd.Series(['a', 'b'], dtype='category')
+   In [2]: s2 = pd.Series(['b', 'c'], dtype='category')
+   In [3]: pd.concat([s1, s2])
+   ValueError: incompatible categories in categorical concat
+
+New Behavior:
+
+.. ipython:: python
+
+   pd.concat([s1, s2])
+
 .. _whatsnew_0190.enhancements.semi_month_offsets:

 Semi-Month Offsets
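``append`` follows the same coercion rule as ``concat`` in the note above; a short sketch (assumes this branch):

    import pandas as pd

    s1 = pd.Series(['a', 'b'], dtype='category')
    s2 = pd.Series(['b', 'c'], dtype='category')

    # previously a ValueError; now coerces to object, matching pd.concat
    res = s1.append(s2, ignore_index=True)
    assert res.dtype == object
    assert list(res) == ['a', 'b', 'b', 'c']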
@@ -378,11 +412,11 @@ get_dummies dtypes

 The ``pd.get_dummies`` function now returns dummy-encoded columns as small
 integers, rather than floats (:issue:`8725`). This should provide an improved
 memory footprint.

-Previous behaviour:
+Previous Behavior:

 .. code-block:: ipython

-   In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes 
+   In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes

   Out[1]:
   a    float64
@@ -404,7 +438,7 @@ Other enhancements

 - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`).
-- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
+- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raise a ``NonExistentTimeError`` (:issue:`13057`)
 - ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`)

 .. ipython:: python
@@ -448,7 +482,6 @@ Other enhancements
 - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`)
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
-- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, :issue:`13846`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
@@ -512,7 +545,7 @@ API changes

 ``Series.tolist()`` will now return Python types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`)
+``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behavior (:issue:`10904`)

 .. ipython:: python
@@ -547,7 +580,7 @@ including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`)

 .. warning::
    Until 0.18.1, comparing ``Series`` with the same length, would succeed even if
-   the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behaviour or align different indexes, using the flexible comparison methods like ``.eq``.
+   the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raise ``ValueError`` to be more strict. This section also describes how to keep previous behavior or align different indexes, using the flexible comparison methods like ``.eq``.

 As a result, ``Series`` and ``DataFrame`` operators behave as below:
@@ -615,7 +648,7 @@ Logical operators

 Logical operators align both ``.index``.

-Previous Behavior (``Series``), only left hand side ``index`` is kept:
+Previous behavior (``Series``), only left hand side ``index`` is kept:

 .. code-block:: ipython
@@ -934,7 +967,7 @@ Index ``+`` / ``-`` no longer used for set operations

 Addition and subtraction of the base Index type (not the numeric subclasses)
 previously performed set operations (set union and difference). This
-behaviour was already deprecated since 0.15.0 (in favor using the specific
+behavior was already deprecated since 0.15.0 (in favor of using the specific
 ``.union()`` and ``.difference()`` methods), and is now disabled. When
 possible, ``+`` and ``-`` are now used for element-wise operations, for
 example for concatenating strings (:issue:`8227`, :issue:`14127`).
@@ -954,13 +987,13 @@ The same operation will now perform element-wise addition:

     pd.Index(['a', 'b']) + pd.Index(['a', 'c'])

 Note that numeric Index objects already performed element-wise operations.
-For example, the behaviour of adding two integer Indexes:
+For example, the behavior of adding two integer Indexes:

 .. ipython:: python

     pd.Index([1, 2, 3]) + pd.Index([2, 3, 4])

-is unchanged. The base ``Index`` is now made consistent with this behaviour.
+is unchanged. The base ``Index`` is now made consistent with this behavior.

 .. _whatsnew_0190.api.difference:
@@ -1111,7 +1144,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument.

    data = 'A,B\n0,1\n2,3\n4,5\n6,7'

-Previous behaviour:
+Previous Behavior:

 .. code-block:: ipython
@@ -1123,7 +1156,7 @@ Previous behaviour:
    0  4  5
    1  6  7

-New behaviour:
+New Behavior:

 .. ipython :: python
@@ -1249,7 +1282,7 @@ These types are the same on many platform, but for 64 bit python on Windows,
 ``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many
 operations on that platform.

-Previous behaviour:
+Previous Behavior:

 .. code-block:: ipython

    In [1]: i = pd.Index(['a', 'b', 'c'])

    In [2]: i.get_indexer(['b', 'b', 'c']).dtype
    Out[2]: dtype('int32')

-New behaviour:
+New Behavior:

.. 
code-block:: ipython diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb2d1a9d1b5d3..9a1c7864903d7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4787,10 +4787,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block(concatenate_join_units(join_units, concat_axis, - copy=copy), - placement=placement) - for placement, join_units in concat_plan] + blocks = [make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) for placement, join_units in concat_plan] return BlockManager(blocks, axes) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index fd6fd90cd631f..23261c2ef79e2 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -185,9 +185,9 @@ def test_concat_empty_series_dtypes(self): 'category') self.assertEqual(pd.concat([Series(dtype='category'), Series(dtype='float64')]).dtype, - np.object_) + 'float64') self.assertEqual(pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype, 'category') + Series(dtype='object')]).dtype, 'object') # sparse result = pd.concat([Series(dtype='float64').to_sparse(), Series( diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index eabd118de671d..c4ddd2c0981d9 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2088,8 +2088,8 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100), + dtype='int32')}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] df = df.sort_values(by=['value'], ascending=True) @@ -3355,16 +3355,15 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical( - ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]) + cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c"]) idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": Categorical( - ['b', 'b'], categories=['a', 'b', 'c']), + {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), "values": [2, 2]}, index=['j', 'k']) tm.assert_frame_equal(result, expected) @@ -3379,10 +3378,9 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.ix["h":"j", 0:1] - expected = DataFrame({'cats': Series( - Categorical( - ['a', 'b', 'b'], categories=['a', 'b', 'c']), index=['h', 'i', - 'j'])}) + expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], + categories=['a', 'b', 'c'])}, + index=['h', 'i', 'j']) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): @@ -3636,8 +3634,8 @@ def f(): with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical( - ["c", "c"], categories=["a", "b", "c"]) + df.ix["j":"k", 0] = pd.Categorical(["c", "c"], + categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3674,8 +3672,8 @@ def f(): self.assertRaises(ValueError, f) # fancy indexing - catsf = 
pd.Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]) + catsf = pd.Categorical(["a", "a", "c", "c", "a", "a", "a"], + categories=["a", "b", "c"]) idxf = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = pd.DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3733,9 +3731,8 @@ def f(): s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series( - pd.Categorical(["b", "a"], - categories=["a", "b"]), index=["x", "y"]) + exp = Series(pd.Categorical(["b", "a"], categories=["a", "b"]), + index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -3887,7 +3884,7 @@ def test_cat_equality(self): self.assertRaises(TypeError, lambda: a > b) self.assertRaises(TypeError, lambda: b > a) - def test_concat(self): + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] df = pd.DataFrame({"cats": cat, "vals": vals}) @@ -3896,20 +3893,22 @@ def test_concat(self): exp = pd.DataFrame({"cats": cat2, "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - res = pd.concat([df, df]) - tm.assert_frame_equal(exp, res) + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) - # Concat should raise if the two categoricals do not have the same - # categories + # GH 13524 can concat different categories cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) + df_different_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - def f(): - pd.concat([df, df_wrong_categories]) + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = pd.DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) - self.assertRaises(ValueError, f) + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) + def test_concat_append_gh7864(self): # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], @@ -3926,41 +3925,44 @@ def f(): df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.categories self.assert_index_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + dfa = df1.append(df2) + self.assert_index_equal(df['grade'].cat.categories, + dfa['grade'].cat.categories) + + def test_concat_preserve(self): - # GH 8641 - # series concat not preserving category dtype + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories s = Series(list('abc'), dtype='category') s2 = Series(list('abd'), dtype='category') - def f(): - pd.concat([s, s2]) - - self.assertRaises(ValueError, f) + exp = Series(list('abcabd')) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s], ignore_index=True) - expected = Series(list('abcabc')).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), dtype='category') + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s]) - expected = Series( - list('abcabc'), index=[0, 1, 2, 0, 1, 2]).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], + dtype='category') + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype='int64')) b = Series(list('aabbca')) df2 = DataFrame({'A': a, 'B': b.astype('category', 
categories=list('cab'))}) - result = pd.concat([df2, df2]) - expected = DataFrame({'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype( - 'category', categories=list('cab'))}) - tm.assert_frame_equal(result, expected) + res = pd.concat([df2, df2]) + exp = DataFrame({'A': pd.concat([a, a]), + 'B': pd.concat([b, b]).astype( + 'category', categories=list('cab'))}) + tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): @@ -3968,44 +3970,21 @@ def test_categorical_index_preserver(self): b = Series(list('aabbca')) df2 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'cab'))}).set_index('B') + 'B': b.astype('category', categories=list('cab')) + }).set_index('B') result = pd.concat([df2, df2]) expected = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( - 'category', categories=list( - 'cab'))}).set_index('B') + 'category', categories=list('cab')) + }).set_index('B') tm.assert_frame_equal(result, expected) # wrong catgories df3 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'abc'))}).set_index('B') + 'B': pd.Categorical(b, categories=list('abc')) + }).set_index('B') self.assertRaises(TypeError, lambda: pd.concat([df2, df3])) - def test_append(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = pd.DataFrame({"cats": cat, "vals": vals}) - cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = pd.DataFrame({"cats": cat2, - "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - - res = df.append(df) - tm.assert_frame_equal(exp, res) - - # Concat should raise if the two categoricals do not have the same - # categories - cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - - def f(): - df.append(df_wrong_categories) - - self.assertRaises(ValueError, f) - def test_merge(self): # GH 9426 @@ -4470,27 +4449,22 @@ def test_dt_accessor_api_for_categorical(self): def test_concat_categorical(self): # See GH 10177 - df1 = pd.DataFrame( - np.arange(18, dtype='int64').reshape(6, - 3), columns=["a", "b", "c"]) - - df2 = pd.DataFrame( - np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"]) - df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two", - "two", "one"])) - - df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True) - - df_expected = pd.DataFrame( - {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]}) - df_expected['h'] = pd.Series(pd.Categorical( - [None, None, None, None, None, None, "one", "one", "two", "one", - "two", "two", "one"])) - - tm.assert_frame_equal(df_expected, df_concat) + df1 = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3), + columns=["a", "b", "c"]) + + df2 = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2), + columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2['h'] = pd.Series(pd.Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True) + exp = pd.DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], + 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + 'h': [None] * 6 + cat_values}) + tm.assert_frame_equal(res, exp) class TestCategoricalSubclassing(tm.TestCase): diff --git 
a/pandas/tools/merge.py b/pandas/tools/merge.py
index 7a29918c55658..6521acbd0b733 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -1290,9 +1290,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
     join_axes : list of Index objects
         Specific indexes to use for the other n - 1 axes instead of performing
         inner/outer set logic
-    verify_integrity : boolean, default False
-        Check whether the new concatenated axis contains duplicates. This can
-        be very expensive relative to the actual data concatenation
+    ignore_index : boolean, default False
+        If True, do not use the index values along the concatenation axis. The
+        resulting axis will be labeled 0, ..., n - 1. This is useful if you are
+        concatenating objects where the concatenation axis does not have
+        meaningful indexing information. Note the index values on the other
+        axes are still respected in the join.
     keys : sequence, default None
         If multiple levels passed, should contain tuples. Construct
         hierarchical index using the passed keys as the outermost level
@@ -1301,12 +1304,9 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
         MultiIndex. Otherwise they will be inferred from the keys
     names : list, default None
         Names for the levels in the resulting hierarchical index
-    ignore_index : boolean, default False
-        If True, do not use the index values along the concatenation axis. The
-        resulting axis will be labeled 0, ..., n - 1. This is useful if you are
-        concatenating objects where the concatenation axis does not have
-        meaningful indexing information. Note the index values on the other
-        axes are still respected in the join.
+    verify_integrity : boolean, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation
     copy : boolean, default True
         If False, do not copy data unnecessarily
@@ -1512,10 +1512,9 @@ def get_result(self):

                 mgrs_indexers.append((obj._data, indexers))

-            new_data = concatenate_block_managers(mgrs_indexers,
-                                                  self.new_axes,
-                                                  concat_axis=self.axis,
-                                                  copy=self.copy)
+            new_data = concatenate_block_managers(
+                mgrs_indexers, self.new_axes, concat_axis=self.axis,
+                copy=self.copy)

             if not self.copy:
                 new_data._consolidate_inplace()
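The ``keys``/``names`` parameters whose docstring entries are reordered above, exercised once (a sketch; the labels are arbitrary):

    import pandas as pd

    s1 = pd.Series([0, 1])
    s2 = pd.Series([2, 3])

    # keys become the outermost level of a hierarchical index; names label the levels
    res = pd.concat([s1, s2], keys=['one', 'two'], names=['group', None])
    assert list(res.index.get_level_values('group')) == ['one', 'one', 'two', 'two']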
diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index 102f21bcdc535..8e20cfa83c405 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -450,6 +450,254 @@ def test_concatlike_common_period_mixed_dt_to_object(self):
         res = pd.concat([tds, ps1])
         tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

+    def test_concat_categorical(self):
+        # GH 13524
+
+        # same categories -> category
+        s1 = pd.Series([1, 2, np.nan], dtype='category')
+        s2 = pd.Series([2, 1, 2], dtype='category')
+
+        exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category')
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        # partially different categories => not-category
+        s1 = pd.Series([3, 2], dtype='category')
+        s2 = pd.Series([2, 1], dtype='category')
+
+        exp = pd.Series([3, 2, 2, 1])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        # completely different categories (same dtype) => not-category
+        s1 = pd.Series([10, 11, np.nan], dtype='category')
+        s2 = pd.Series([np.nan, 1, 3, 2], dtype='category')
+
+        exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+    def test_concat_categorical_coercion(self):
+        # GH 13524
+
+        # category + not-category => not-category
+        s1 = pd.Series([1, 2, np.nan], dtype='category')
+        s2 = pd.Series([2, 1, 2])
+
+        exp = pd.Series([1, 2, np.nan, 2, 1, 2])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        # result shouldn't be affected by 1st elem dtype
+        exp = pd.Series([2, 1, 2, 1, 2, np.nan])
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+        # all values are not in category => not-category
+        s1 = pd.Series([3, 2], dtype='category')
+        s2 = pd.Series([2, 1])
+
+        exp = pd.Series([3, 2, 2, 1])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        exp = pd.Series([2, 1, 3, 2])
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+        # completely different categories => not-category
+        s1 = pd.Series([10, 11, np.nan], dtype='category')
+        s2 = pd.Series([1, 3, 2])
+
+        exp = pd.Series([10, 11, np.nan, 1, 3, 2])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        exp = pd.Series([1, 3, 2, 10, 11, np.nan])
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+        # different dtype => not-category
+        s1 = pd.Series([10, 11, np.nan], dtype='category')
+        s2 = pd.Series(['a', 'b', 'c'])
+
+        exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c'])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan])
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+        # if normal series only contains NaN-likes => not-category
+        s1 = pd.Series([10, 11], dtype='category')
+        s2 = pd.Series([np.nan, np.nan, np.nan])
+
+        exp = pd.Series([10, 11, np.nan, np.nan, np.nan])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        exp = pd.Series([np.nan, np.nan, np.nan, 10, 11])
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+    def test_concat_categorical_3elem_coercion(self):
+        # GH 13524
+
+        # mixed dtypes => not-category
+        s1 = pd.Series([1, 2, np.nan], dtype='category')
+        s2 = pd.Series([2, 1, 2], dtype='category')
+        s3 = pd.Series([1, 2, 1, 2, np.nan])
+
+        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan])
+        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
+
+        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2])
+        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
+
+        # values are all in either category => not-category
+        s1 = pd.Series([4, 5, 6], dtype='category')
+        s2 = pd.Series([1, 2, 3], dtype='category')
+        s3 = 
pd.Series([1, 3, 4]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([10, 11, 12]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = pd.Series([1, 3], dtype='category') + s2 = pd.Series([3, 4], dtype='category') + s3 = pd.Series([2, 3]) + s4 = pd.Series([2, 2], dtype='category') + s5 = pd.Series([1, np.nan]) + s6 = pd.Series([1, 3, 2], dtype='category') + + # mixed dtype, values are all in categories => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], + ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), + dtype='category') + s2 = pd.Series([np.nan, 1]) + + exp = pd.Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = pd.Series([1, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([1, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + 
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+
+        # all category nan-likes => category
+        s1 = pd.Series([np.nan, np.nan], dtype='category')
+        s2 = pd.Series([np.nan, np.nan], dtype='category')
+
+        exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category')
+
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+    def test_concat_categorical_empty(self):
+        # GH 13524
+
+        s1 = pd.Series([], dtype='category')
+        s2 = pd.Series([1, 2], dtype='category')
+
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
+
+        s1 = pd.Series([], dtype='category')
+        s2 = pd.Series([], dtype='category')
+
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+
+        s1 = pd.Series([], dtype='category')
+        s2 = pd.Series([])
+
+        # different dtype => not-category
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
+
+        s1 = pd.Series([], dtype='category')
+        s2 = pd.Series([np.nan, np.nan])
+
+        # empty Series is ignored
+        exp = pd.Series([np.nan, np.nan])
+        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
+        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
+
+        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
+        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
+

 class TestAppend(ConcatenateBase):
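Two of the NaN edge cases pinned down by the tests above, restated compactly (a sketch; the expected dtypes follow the expectations encoded in the tests):

    import numpy as np
    import pandas as pd

    # category + all-NaN numeric series: not category, inferred as float64
    s1 = pd.Series([1, np.nan], dtype='category')
    s2 = pd.Series([np.nan, np.nan])
    assert pd.concat([s1, s2], ignore_index=True).dtype == np.float64

    # mixed dtype where everything is NaN-like: falls back to object
    s3 = pd.Series([np.nan, np.nan], dtype='category')
    assert pd.concat([s3, s2], ignore_index=True).dtype == object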
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index 29a0fe7d9f8d0..8bdd71348a537 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -5,7 +5,6 @@
 import numpy as np
 import pandas.tslib as tslib
 from pandas import compat
-from pandas.compat import map
 from pandas.core.algorithms import take_1d

 from .common import (is_categorical_dtype, is_sparse,
@@ -133,19 +132,21 @@ def is_nonempty(x):

     typs = get_dtype_kinds(to_concat)

-    # these are mandated to handle empties as well
     _contains_datetime = any(typ.startswith('datetime') for typ in typs)
     _contains_period = any(typ.startswith('period') for typ in typs)

-    if _contains_datetime or 'timedelta' in typs or _contains_period:
+    if 'category' in typs:
+        # this must be prior to _concat_datetime,
+        # to support Categorical + datetime-like
+        return _concat_categorical(to_concat, axis=axis)
+
+    elif _contains_datetime or 'timedelta' in typs or _contains_period:
         return _concat_datetime(to_concat, axis=axis, typs=typs)

+    # these are mandated to handle empties as well
     elif 'sparse' in typs:
         return _concat_sparse(to_concat, axis=axis, typs=typs)

-    elif 'category' in typs:
-        return _concat_categorical(to_concat, axis=axis)
-
     if not nonempty:
         # we have all empties, but may need to coerce the result dtype to
         # object if we have non-numeric type operands (numpy would otherwise
@@ -181,18 +182,14 @@ def _concat_categorical(to_concat, axis=0):

     A single array, preserving the combined dtypes
     """
-    from pandas.core.categorical import Categorical
-
-    def convert_categorical(x):
-        # coerce to object dtype
-        if is_categorical_dtype(x.dtype):
-            return x.get_values()
-        return x.ravel()
-
-    if get_dtype_kinds(to_concat) - set(['object', 'category']):
-        # convert to object type and perform a regular concat
-        return _concat_compat([np.array(x, copy=False, dtype=object)
-                               for x in to_concat], axis=0)
+    def _concat_asobject(to_concat):
+        to_concat = [x.get_values() if is_categorical_dtype(x.dtype)
+                     else x.ravel() for x in to_concat]
+        res = _concat_compat(to_concat)
+        if axis == 1:
+            return res.reshape(1, len(res))
+        else:
+            return res

     # we could have object blocks and categoricals here
     # if we only have a single categoricals then combine everything
@@ -200,25 +197,13 @@ def convert_categorical(x):
     categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

     # validate the categories
-    categories = categoricals[0]
-    rawcats = categories.categories
-    for x in categoricals[1:]:
-        if not categories.is_dtype_equal(x):
-            raise ValueError("incompatible categories in categorical concat")
-
-    # we've already checked that all categoricals are the same, so if their
-    # length is equal to the input then we have all the same categories
-    if len(categoricals) == len(to_concat):
-        # concating numeric types is much faster than concating object types
-        # and fastpath takes a shorter path through the constructor
-        return Categorical(np.concatenate([x.codes for x in to_concat],
-                                          axis=0),
-                           rawcats, ordered=categoricals[0].ordered,
-                           fastpath=True)
-    else:
-        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
-                                   axis=0)
-        return Categorical(concatted, rawcats)
+    if len(categoricals) == len(to_concat):
+        # when all categories are identical
+        first = to_concat[0]
+        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
+            return union_categoricals(categoricals)
+
+    return _concat_asobject(to_concat)


 def union_categoricals(to_union, sort_categories=False):
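Taken together with the category-first dispatch added above, ``union_categoricals`` is now the single path for same-category concatenation. Its public behavior, sketched end to end (assumes this branch; the error message matches the docs added in categorical.rst):

    import pandas as pd
    from pandas.types.concat import union_categoricals

    a = pd.Categorical(['b', 'c'])
    b = pd.Categorical(['a', 'b'])

    # categories combine in order of appearance unless sort_categories=True
    assert list(union_categoricals([a, b]).categories) == ['b', 'c', 'a']
    assert list(union_categoricals([a, b], sort_categories=True).categories) == ['a', 'b', 'c']

    # ordered categoricals must carry identical categories
    x = pd.Categorical(['a', 'b'], ordered=True)
    y = pd.Categorical(['a', 'b', 'c'], ordered=True)
    try:
        union_categoricals([x, y])
    except TypeError:
        pass  # "to union ordered Categoricals, all categories must be the same"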