diff --git a/doc/source/merging.rst b/doc/source/merging.rst index d78e476dd7837..98914c13d4d31 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -506,8 +506,8 @@ You can also pass a list of dicts or Series: .. _merging.join: -Database-style DataFrame joining/merging ----------------------------------------- +Database-style DataFrame or named Series joining/merging +-------------------------------------------------------- pandas has full-featured, **high performance** in-memory join operations idiomatically very similar to relational databases like SQL. These methods @@ -522,7 +522,7 @@ Users who are familiar with SQL but new to pandas might be interested in a :ref:`comparison with SQL<compare_with_sql>`. pandas provides a single function, :func:`~pandas.merge`, as the entry point for -all standard database join operations between ``DataFrame`` objects: +all standard database join operations between ``DataFrame`` or named ``Series`` objects: :: @@ -531,23 +531,23 @@ all standard database join operations between ``DataFrame`` objects: suffixes=('_x', '_y'), copy=True, indicator=False, validate=None) -* ``left``: A DataFrame object. -* ``right``: Another DataFrame object. +* ``left``: A DataFrame or named Series object. +* ``right``: Another DataFrame or named Series object. * ``on``: Column or index level names to join on. Must be found in both the left - and right DataFrame objects. If not passed and ``left_index`` and + and right DataFrame and/or Series objects. If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the - DataFrames will be inferred to be the join keys. -* ``left_on``: Columns or index levels from the left DataFrame to use as + DataFrames and/or Series will be inferred to be the join keys. +* ``left_on``: Columns or index levels from the left DataFrame or Series to use as keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame.
-* ``right_on``: Columns or index levels from the right DataFrame to use as + equal to the length of the DataFrame or Series. +* ``right_on``: Columns or index levels from the right DataFrame or Series to use as keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame. + equal to the length of the DataFrame or Series. * ``left_index``: If ``True``, use the index (row labels) from the left - DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex + DataFrame or Series as its join key(s). In the case of a DataFrame or Series with a MultiIndex (hierarchical), the number of levels must match the number of join keys - from the right DataFrame. -* ``right_index``: Same usage as ``left_index`` for the right DataFrame + from the right DataFrame or Series. +* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series * ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults to ``inner``. See below for more detailed description of each method. * ``sort``: Sort the result DataFrame by the join keys in lexicographical @@ -555,7 +555,7 @@ all standard database join operations between ``DataFrame`` objects: substantially in many cases. * ``suffixes``: A tuple of string suffixes to apply to overlapping columns. Defaults to ``('_x', '_y')``. -* ``copy``: Always copy data (default ``True``) from the passed DataFrame +* ``copy``: Always copy data (default ``True``) from the passed DataFrame or named Series objects, even when reindexing is not necessary. Cannot be avoided in many cases but may improve performance / memory usage. The cases where copying can be avoided are somewhat pathological but this option is provided @@ -563,8 +563,8 @@ all standard database join operations between ``DataFrame`` objects: * ``indicator``: Add a column to the output DataFrame called ``_merge`` with information on the source of each row. 
``_merge`` is Categorical-type and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame, and ``both`` if the + only appears in ``'left'`` DataFrame or Series, ``right_only`` for observations whose + merge key only appears in ``'right'`` DataFrame or Series, and ``both`` if the observation's merge key is found in both. * ``validate`` : string, default None. @@ -584,10 +584,10 @@ all standard database join operations between ``DataFrame`` objects: Support for specifying index levels as the ``on``, ``left_on``, and ``right_on`` parameters was added in version 0.23.0. + Support for merging named ``Series`` objects was added in version 0.24.0. -The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` -and ``right`` is a subclass of DataFrame, the return type will still be -``DataFrame``. +The return type will be the same as ``left``, except that a named ``Series`` is first converted to a ``DataFrame``, so merging a ``Series`` returns a ``DataFrame``. If ``left`` is a ``DataFrame`` or named ``Series`` +and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``.
``merge`` is a function in the pandas namespace, and it is also available as a ``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index b015495b095b6..769bda992956b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -12,6 +12,7 @@ v0.24.0 (Month XX, 2018) New features ~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4578d2ac08199..873170eb9813b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -137,8 +137,8 @@ """ _merge_doc = """ -Merge DataFrame objects by performing a database-style join operation by -columns or indexes. +Merge DataFrame or named Series objects by performing a database-style join +operation by columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or @@ -146,7 +146,7 @@ Parameters ----------%s -right : DataFrame, Series or dict +right : DataFrame or named Series Object to merge with. how : {'left', 'right', 'outer', 'inner'}, default 'inner' Type of merge to be performed. 
@@ -217,6 +217,7 @@ ----- Support for specifying index levels as the `on`, `left_on`, and `right_on` parameters was added in version 0.23.0 +Support for merging named Series objects was added in version 0.24.0 See Also -------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 25d8cb4e804a2..caaeb1bad2358 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -11,7 +11,7 @@ import pandas.compat as compat from pandas import (Categorical, DataFrame, - Index, MultiIndex, Timedelta) + Index, MultiIndex, Timedelta, Series) from pandas.core.arrays.categorical import _recode_for_categories from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( @@ -493,6 +493,8 @@ def __init__(self, left, right, how='inner', on=None, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None): + left = validate_operand(left) + right = validate_operand(right) self.left = self.orig_left = left self.right = self.orig_right = right self.how = how @@ -519,13 +521,6 @@ def __init__(self, left, right, how='inner', on=None, raise ValueError( 'indicator option can only accept boolean or string arguments') - if not isinstance(left, DataFrame): - raise ValueError('can not merge DataFrame with instance of ' - 'type {left}'.format(left=type(left))) - if not isinstance(right, DataFrame): - raise ValueError('can not merge DataFrame with instance of ' - 'type {right}'.format(right=type(right))) - if not is_bool(left_index): raise ValueError( 'left_index parameter must be of type bool, not ' @@ -1645,3 +1640,16 @@ def _should_fill(lname, rname): def _any(x): return x is not None and com._any_not_none(*x) + + +def validate_operand(obj): + if isinstance(obj, DataFrame): + return obj + elif isinstance(obj, Series): + if obj.name is None: + raise ValueError('Cannot merge a Series without a name') + else: + return obj.to_frame() + else: + raise TypeError('Can only merge Series or 
DataFrame objects, ' + 'a {obj} was passed'.format(obj=type(obj))) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 1b8f3632d381c..09f511886583c 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -228,16 +228,18 @@ def test_join_on_fails_with_different_column_counts(self): index=tm.makeCustomIndex(10, 2)) merge(df, df2, right_on='a', left_on=['a', 'b']) - def test_join_on_fails_with_wrong_object_type(self): - # GH12081 - wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] - df = DataFrame({'a': [1, 1]}) + @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) + def test_join_on_fails_with_wrong_object_type(self, wrong_type): + # GH12081 - original issue + + # GH21220 - merging of Series and DataFrame is now allowed + # Edited test to remove the Series object from test parameters - for obj in wrongly_typed: - with tm.assert_raises_regex(ValueError, str(type(obj))): - merge(obj, df, left_on='a', right_on='a') - with tm.assert_raises_regex(ValueError, str(type(obj))): - merge(df, obj, left_on='a', right_on='a') + df = DataFrame({'a': [1, 1]}) + with tm.assert_raises_regex(TypeError, str(type(wrong_type))): + merge(wrong_type, df, left_on='a', right_on='a') + with tm.assert_raises_regex(TypeError, str(type(wrong_type))): + merge(df, wrong_type, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 037bd9cc7cd18..42df4511578f1 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1887,3 +1887,33 @@ def test_merge_index_types(index): OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]), index=index) assert_frame_equal(result, expected) + + 
@pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nms,nm", [
    (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], 'B'),
    (None, None, None, True, True, ['outer', 'inner'], 'B'),
    (None, ['outer', 'inner'], None, False, True, None, 'B'),
    (None, None, ['outer', 'inner'], True, False, None, 'B'),
    (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], None),
    (None, None, None, True, True, ['outer', 'inner'], None),
    (None, ['outer', 'inner'], None, False, True, None, None),
    (None, None, ['outer', 'inner'], True, False, None, None)])
def test_merge_series(on, left_on, right_on, left_index, right_index, nms, nm):
    """Merge a DataFrame with a Series for each way of naming the keys.

    GH 21220.

    Parameters
    ----------
    on, left_on, right_on, left_index, right_index
        Key specification, forwarded unchanged to ``pd.merge``.
    nms : list of str or None
        Level names given to the MultiIndex of the expected result.
    nm : str or None
        Name given to the Series operand. When it is a string the merge
        result is compared with the expected frame; when it is ``None``
        the merge must raise ``ValueError``.
    """
    level_names = ['outer', 'inner']
    a = pd.DataFrame(
        {"A": [1, 2, 3, 4]},
        index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
                                         names=level_names))
    b = pd.Series(
        [1, 2, 3, 4],
        index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
                                         names=level_names),
        name=nm)

    # Spelled out once so both branches issue exactly the same call.
    merge_kwargs = dict(on=on, left_on=left_on, right_on=right_on,
                        left_index=left_index, right_index=right_index)

    if nm is None:
        # The call is expected to raise, so its return value is not bound.
        with tm.assert_raises_regex(ValueError, 'a Series without a name'):
            pd.merge(a, b, **merge_kwargs)
        return

    # ``a`` has inner levels 0 and 1, ``b`` has 1 and 2: only the inner == 1
    # rows are present in both operands.
    expected = pd.DataFrame(
        {"A": [2, 4], "B": [1, 3]},
        index=pd.MultiIndex.from_product([['a', 'b'], [1]], names=nms))
    result = pd.merge(a, b, **merge_kwargs)
    tm.assert_frame_equal(result, expected)