ENH: Merge DataFrame and Series using on (GH21220) (#21223)

KalyanGokhale · jorisvandenbossche · commit b97545563f72 · 2018-07-23T19:02:13.000+02:00
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -506,8 +506,8 @@ You can also pass a list of dicts or Series:
 
 .. _merging.join:
 
-Database-style DataFrame joining/merging
-----------------------------------------
+Database-style DataFrame or named Series joining/merging
+--------------------------------------------------------
 
 pandas has full-featured, **high performance** in-memory join operations
 idiomatically very similar to relational databases like SQL. These methods
@@ -522,7 +522,7 @@ Users who are familiar with SQL but new to pandas might be interested in a
 :ref:`comparison with SQL<compare_with_sql.join>`.
 
 pandas provides a single function, :func:`~pandas.merge`, as the entry point for 
-all standard database join operations between ``DataFrame`` objects:
+all standard database join operations between ``DataFrame`` or named ``Series`` objects:
 
 ::
 
@@ -531,40 +531,40 @@ all standard database join operations between ``DataFrame`` objects:
              suffixes=('_x', '_y'), copy=True, indicator=False,
              validate=None)
 
-* ``left``: A DataFrame object.
-* ``right``: Another DataFrame object.
+* ``left``: A DataFrame or named Series object.
+* ``right``: Another DataFrame or named Series object.
 * ``on``: Column or index level names to join on. Must be found in both the left
-  and right DataFrame objects. If not passed and ``left_index`` and
+  and right DataFrame and/or Series objects. If not passed and ``left_index`` and
   ``right_index`` are ``False``, the intersection of the columns in the
-  DataFrames will be inferred to be the join keys.
-* ``left_on``: Columns or index levels from the left DataFrame to use as
+  DataFrames and/or Series will be inferred to be the join keys.
+* ``left_on``: Columns or index levels from the left DataFrame or Series to use as
   keys. Can either be column names, index level names, or arrays with length
-  equal to the length of the DataFrame.
-* ``right_on``: Columns or index levels from the right DataFrame to use as
+  equal to the length of the DataFrame or Series.
+* ``right_on``: Columns or index levels from the right DataFrame or Series to use as
   keys. Can either be column names, index level names, or arrays with length
-  equal to the length of the DataFrame.
+  equal to the length of the DataFrame or Series.
 * ``left_index``: If ``True``, use the index (row labels) from the left
-  DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex
+  DataFrame or Series as its join key(s). In the case of a DataFrame or Series with a MultiIndex
   (hierarchical), the number of levels must match the number of join keys
-  from the right DataFrame.
-* ``right_index``: Same usage as ``left_index`` for the right DataFrame
+  from the right DataFrame or Series.
+* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series.
 * ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults
   to ``inner``. See below for more detailed description of each method.
 * ``sort``: Sort the result DataFrame by the join keys in lexicographical
   order. Defaults to ``True``, setting to ``False`` will improve performance
   substantially in many cases.
 * ``suffixes``: A tuple of string suffixes to apply to overlapping
   columns. Defaults to ``('_x', '_y')``.
-* ``copy``: Always copy data (default ``True``) from the passed DataFrame
+* ``copy``: Always copy data (default ``True``) from the passed DataFrame or named Series
   objects, even when reindexing is not necessary. Cannot be avoided in many
   cases but may improve performance / memory usage. The cases where copying
   can be avoided are somewhat pathological but this option is provided
   nonetheless.
 * ``indicator``: Add a column to the output DataFrame called ``_merge``
   with information on the source of each row. ``_merge`` is Categorical-type
   and takes on a value of ``left_only`` for observations whose merge key
-  only appears in ``'left'`` DataFrame, ``right_only`` for observations whose
-  merge key only appears in ``'right'`` DataFrame, and ``both`` if the
+  only appears in ``'left'`` DataFrame or Series, ``right_only`` for observations whose
+  merge key only appears in ``'right'`` DataFrame or Series, and ``both`` if the
   observation's merge key is found in both.
 
 * ``validate`` : string, default None.
@@ -584,10 +584,10 @@ all standard database join operations between ``DataFrame`` objects:
 
    Support for specifying index levels as the ``on``, ``left_on``, and
    ``right_on`` parameters was added in version 0.23.0.
+   Support for merging named ``Series`` objects was added in version 0.24.0.
 
-The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
-and ``right`` is a subclass of DataFrame, the return type will still be
-``DataFrame``.
+The return type will be the same as ``left``, except that a named ``Series`` is converted to a ``DataFrame`` first.
+If ``left`` is a ``DataFrame`` or named ``Series`` and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``.
 
 ``merge`` is a function in the pandas namespace, and it is also available as a
 ``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling 
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -12,6 +12,7 @@ v0.24.0 (Month XX, 2018)
 
 New features
 ~~~~~~~~~~~~
+- :func:`merge` now directly allows merging between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
 
 
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -137,16 +137,16 @@
 """
 
 _merge_doc = """
-Merge DataFrame objects by performing a database-style join operation by
-columns or indexes.
+Merge DataFrame or named Series objects by performing a database-style join
+operation by columns or indexes.
 
 If joining columns on columns, the DataFrame indexes *will be
 ignored*. Otherwise if joining indexes on indexes or indexes on a column or
 columns, the index will be passed on.
 
 Parameters
 ----------%s
-right : DataFrame, Series or dict
+right : DataFrame or named Series
     Object to merge with.
 how : {'left', 'right', 'outer', 'inner'}, default 'inner'
     Type of merge to be performed.
@@ -217,6 +217,7 @@
 -----
 Support for specifying index levels as the `on`, `left_on`, and
 `right_on` parameters was added in version 0.23.0
+Support for merging named Series objects was added in version 0.24.0.
 
 See Also
 --------
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -11,7 +11,7 @@
 import pandas.compat as compat
 
 from pandas import (Categorical, DataFrame,
-                    Index, MultiIndex, Timedelta)
+                    Index, MultiIndex, Timedelta, Series)
 from pandas.core.arrays.categorical import _recode_for_categories
 from pandas.core.frame import _merge_doc
 from pandas.core.dtypes.common import (
@@ -493,6 +493,8 @@ def __init__(self, left, right, how='inner', on=None,
                  left_index=False, right_index=False, sort=True,
                  suffixes=('_x', '_y'), copy=True, indicator=False,
                  validate=None):
+        left = validate_operand(left)
+        right = validate_operand(right)
         self.left = self.orig_left = left
         self.right = self.orig_right = right
         self.how = how
@@ -519,13 +521,6 @@ def __init__(self, left, right, how='inner', on=None,
             raise ValueError(
                 'indicator option can only accept boolean or string arguments')
 
-        if not isinstance(left, DataFrame):
-            raise ValueError('can not merge DataFrame with instance of '
-                             'type {left}'.format(left=type(left)))
-        if not isinstance(right, DataFrame):
-            raise ValueError('can not merge DataFrame with instance of '
-                             'type {right}'.format(right=type(right)))
-
         if not is_bool(left_index):
             raise ValueError(
                 'left_index parameter must be of type bool, not '
@@ -1645,3 +1640,16 @@ def _should_fill(lname, rname):
 
 def _any(x):
     return x is not None and com._any_not_none(*x)
+
+
+def validate_operand(obj):
+    if isinstance(obj, DataFrame):
+        return obj
+    elif isinstance(obj, Series):
+        if obj.name is None:
+            raise ValueError('Cannot merge a Series without a name')
+        else:
+            return obj.to_frame()
+    else:
+        raise TypeError('Can only merge Series or DataFrame objects, '
+                        'a {obj} was passed'.format(obj=type(obj)))
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
@@ -228,16 +228,18 @@ def test_join_on_fails_with_different_column_counts(self):
                             index=tm.makeCustomIndex(10, 2))
             merge(df, df2, right_on='a', left_on=['a', 'b'])
 
-    def test_join_on_fails_with_wrong_object_type(self):
-        # GH12081
-        wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
-        df = DataFrame({'a': [1, 1]})
+    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
+    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
+        # GH12081 - original issue
+
+        # GH21220 - merging of Series and DataFrame is now allowed
+        # Edited test to remove the Series object from test parameters
 
-        for obj in wrongly_typed:
-            with tm.assert_raises_regex(ValueError, str(type(obj))):
-                merge(obj, df, left_on='a', right_on='a')
-            with tm.assert_raises_regex(ValueError, str(type(obj))):
-                merge(df, obj, left_on='a', right_on='a')
+        df = DataFrame({'a': [1, 1]})
+        with tm.assert_raises_regex(TypeError, str(type(wrong_type))):
+            merge(wrong_type, df, left_on='a', right_on='a')
+        with tm.assert_raises_regex(TypeError, str(type(wrong_type))):
+            merge(df, wrong_type, left_on='a', right_on='a')
 
     def test_join_on_pass_vector(self):
         expected = self.target.join(self.source, on='C')
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1887,3 +1887,33 @@ def test_merge_index_types(index):
         OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]),
         index=index)
     assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nms,nm", [
+    (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], 'B'),
+    (None, None, None, True, True, ['outer', 'inner'], 'B'),
+    (None, ['outer', 'inner'], None, False, True, None, 'B'),
+    (None, None, ['outer', 'inner'], True, False, None, 'B'),
+    (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], None),
+    (None, None, None, True, True, ['outer', 'inner'], None),
+    (None, ['outer', 'inner'], None, False, True, None, None),
+    (None, None, ['outer', 'inner'], True, False, None, None)])
+def test_merge_series(on, left_on, right_on, left_index, right_index, nms, nm):
+    # GH 21220
+    a = pd.DataFrame({"A": [1, 2, 3, 4]},
+                     index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
+                     names=['outer', 'inner']))
+    b = pd.Series([1, 2, 3, 4],
+                  index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
+                  names=['outer', 'inner']), name=nm)
+    expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]},
+                            index=pd.MultiIndex.from_product([['a', 'b'], [1]],
+                            names=nms))
+    if nm is not None:
+        result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
+                          left_index=left_index, right_index=right_index)
+        tm.assert_frame_equal(result, expected)
+    else:
+        with tm.assert_raises_regex(ValueError, 'a Series without a name'):
+            result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
+                              left_index=left_index, right_index=right_index)