diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index aee56a2565310..8dbfc261e6fa8 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -7,6 +7,7 @@ from matplotlib import pyplot as plt import pandas.util._doctools as doctools + p = doctools.TablePlotter() @@ -38,23 +39,35 @@ a simple example: .. ipython:: python - df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=[0, 1, 2, 3]) + df1 = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + }, + index=[0, 1, 2, 3], + ) - df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], - 'B': ['B4', 'B5', 'B6', 'B7'], - 'C': ['C4', 'C5', 'C6', 'C7'], - 'D': ['D4', 'D5', 'D6', 'D7']}, - index=[4, 5, 6, 7]) + df2 = pd.DataFrame( + { + "A": ["A4", "A5", "A6", "A7"], + "B": ["B4", "B5", "B6", "B7"], + "C": ["C4", "C5", "C6", "C7"], + "D": ["D4", "D5", "D6", "D7"], + }, + index=[4, 5, 6, 7], + ) - df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'], - 'B': ['B8', 'B9', 'B10', 'B11'], - 'C': ['C8', 'C9', 'C10', 'C11'], - 'D': ['D8', 'D9', 'D10', 'D11']}, - index=[8, 9, 10, 11]) + df3 = pd.DataFrame( + { + "A": ["A8", "A9", "A10", "A11"], + "B": ["B8", "B9", "B10", "B11"], + "C": ["C8", "C9", "C10", "C11"], + "D": ["D8", "D9", "D10", "D11"], + }, + index=[8, 9, 10, 11], + ) frames = [df1, df2, df3] result = pd.concat(frames) @@ -109,7 +122,7 @@ with each of the pieces of the chopped up DataFrame. We can do this using the .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @@ -125,7 +138,7 @@ means that we can now select out each chunk by key: .. ipython:: python - result.loc['y'] + result.loc["y"] It's not a stretch to see how this can be very useful. 
More detail on this functionality below. @@ -158,10 +171,14 @@ behavior: .. ipython:: python - df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) + df4 = pd.DataFrame( + { + "B": ["B2", "B3", "B6", "B7"], + "D": ["D2", "D3", "D6", "D7"], + "F": ["F2", "F3", "F6", "F7"], + }, + index=[2, 3, 6, 7], + ) result = pd.concat([df1, df4], axis=1, sort=False) @@ -184,7 +201,7 @@ Here is the same thing with ``join='inner'``: .. ipython:: python - result = pd.concat([df1, df4], axis=1, join='inner') + result = pd.concat([df1, df4], axis=1, join="inner") .. ipython:: python :suppress: @@ -316,7 +333,7 @@ the name of the ``Series``. .. ipython:: python - s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name='X') + s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X") result = pd.concat([df1, s1], axis=1) .. ipython:: python @@ -338,7 +355,7 @@ If unnamed ``Series`` are passed they will be numbered consecutively. .. ipython:: python - s2 = pd.Series(['_0', '_1', '_2', '_3']) + s2 = pd.Series(["_0", "_1", "_2", "_3"]) result = pd.concat([df1, s2, s2, s2], axis=1) .. ipython:: python @@ -373,7 +390,7 @@ inherit the parent ``Series``' name, when these existed. .. ipython:: python - s3 = pd.Series([0, 1, 2, 3], name='foo') + s3 = pd.Series([0, 1, 2, 3], name="foo") s4 = pd.Series([0, 1, 2, 3]) s5 = pd.Series([0, 1, 4, 5]) @@ -383,13 +400,13 @@ Through the ``keys`` argument we can override the existing column names. .. ipython:: python - pd.concat([s3, s4, s5], axis=1, keys=['red', 'blue', 'yellow']) + pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"]) Let's consider a variation of the very first example presented: .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @@ -404,7 +421,7 @@ for the ``keys`` argument (unless other keys are specified): .. 
ipython:: python - pieces = {'x': df1, 'y': df2, 'z': df3} + pieces = {"x": df1, "y": df2, "z": df3} result = pd.concat(pieces) .. ipython:: python @@ -417,7 +434,7 @@ for the ``keys`` argument (unless other keys are specified): .. ipython:: python - result = pd.concat(pieces, keys=['z', 'y']) + result = pd.concat(pieces, keys=["z", "y"]) .. ipython:: python :suppress: @@ -439,9 +456,9 @@ do so using the ``levels`` argument: .. ipython:: python - result = pd.concat(pieces, keys=['x', 'y', 'z'], - levels=[['z', 'y', 'x', 'w']], - names=['group_key']) + result = pd.concat( + pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"] + ) .. ipython:: python :suppress: @@ -469,7 +486,7 @@ append a single row to a ``DataFrame`` by passing a ``Series`` or dict to .. ipython:: python - s2 = pd.Series(['X0', 'X1', 'X2', 'X3'], index=['A', 'B', 'C', 'D']) + s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"]) result = df1.append(s2, ignore_index=True) .. ipython:: python @@ -488,8 +505,7 @@ You can also pass a list of dicts or Series: .. ipython:: python - dicts = [{'A': 1, 'B': 2, 'C': 3, 'X': 4}, - {'A': 5, 'B': 6, 'C': 7, 'Y': 8}] + dicts = [{"A": 1, "B": 2, "C": 3, "X": 4}, {"A": 5, "B": 6, "C": 7, "Y": 8}] result = df1.append(dicts, ignore_index=True, sort=False) .. ipython:: python @@ -619,14 +635,22 @@ key combination: .. 
ipython:: python - left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) - result = pd.merge(left, right, on='key') + right = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) + result = pd.merge(left, right, on="key") .. ipython:: python :suppress: @@ -642,17 +666,25 @@ appearing in ``left`` and ``right`` are present (the intersection), since .. ipython:: python - left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], - 'key2': ['K0', 'K0', 'K0', 'K0'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) + right = pd.DataFrame( + { + "key1": ["K0", "K1", "K1", "K2"], + "key2": ["K0", "K0", "K0", "K0"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) - result = pd.merge(left, right, on=['key1', 'key2']) + result = pd.merge(left, right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -678,7 +710,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='left', on=['key1', 'key2']) + result = pd.merge(left, right, how="left", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -690,7 +722,7 @@ either the left or right tables, the values in the joined table will be .. 
ipython:: python - result = pd.merge(left, right, how='right', on=['key1', 'key2']) + result = pd.merge(left, right, how="right", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -701,7 +733,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='outer', on=['key1', 'key2']) + result = pd.merge(left, right, how="outer", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -713,7 +745,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='inner', on=['key1', 'key2']) + result = pd.merge(left, right, how="inner", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -741,18 +773,18 @@ as shown in the following example. ) ser - pd.merge(df, ser.reset_index(), on=['Let', 'Num']) + pd.merge(df, ser.reset_index(), on=["Let", "Num"]) Here is another example with duplicate join keys in DataFrames: .. ipython:: python - left = pd.DataFrame({'A': [1, 2], 'B': [2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [2, 2]}) - right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) - result = pd.merge(left, right, on='B', how='outer') + result = pd.merge(left, right, on="B", how="outer") .. ipython:: python :suppress: @@ -784,8 +816,8 @@ In the following example, there are duplicate values of ``B`` in the right .. ipython:: python - left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]}) - right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) .. code-block:: ipython @@ -799,7 +831,7 @@ ensure there are no duplicates in the left DataFrame, one can use the .. ipython:: python - pd.merge(left, right, on='B', how='outer', validate="one_to_many") + pd.merge(left, right, on="B", how="outer", validate="one_to_many") .. 
_merging.indicator: @@ -821,15 +853,15 @@ that takes on values: .. ipython:: python - df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']}) - df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python - pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column") .. _merging.dtypes: @@ -841,25 +873,25 @@ Merging will preserve the dtype of the join keys. .. ipython:: python - left = pd.DataFrame({'key': [1], 'v1': [10]}) + left = pd.DataFrame({"key": [1], "v1": [10]}) left - right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) right We are able to preserve the join keys: .. ipython:: python - pd.merge(left, right, how='outer') - pd.merge(left, right, how='outer').dtypes + pd.merge(left, right, how="outer") + pd.merge(left, right, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast. .. ipython:: python - pd.merge(left, right, how='outer', on='key') - pd.merge(left, right, how='outer', on='key').dtypes + pd.merge(left, right, how="outer", on="key") + pd.merge(left, right, how="outer", on="key").dtypes Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals <categorical.merge>`. @@ -869,12 +901,12 @@ The left frame. 
from pandas.api.types import CategoricalDtype - X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) + X = pd.Series(np.random.choice(["foo", "bar"], size=(10,))) + X = X.astype(CategoricalDtype(categories=["foo", "bar"])) - left = pd.DataFrame({'X': X, - 'Y': np.random.choice(['one', 'two', 'three'], - size=(10,))}) + left = pd.DataFrame( + {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))} + ) left left.dtypes @@ -882,9 +914,12 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar'], - dtype=CategoricalDtype(['foo', 'bar'])), - 'Z': [1, 2]}) + right = pd.DataFrame( + { + "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } + ) right right.dtypes @@ -892,7 +927,7 @@ The merged result: .. ipython:: python - result = pd.merge(left, right, how='outer') + result = pd.merge(left, right, how="outer") result result.dtypes @@ -916,13 +951,13 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=['K0', 'K1', 'K2']) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"] + ) - right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], - 'D': ['D0', 'D2', 'D3']}, - index=['K0', 'K2', 'K3']) + right = pd.DataFrame( + {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"] + ) result = left.join(right) @@ -936,7 +971,7 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python - result = left.join(right, how='outer') + result = left.join(right, how="outer") .. ipython:: python :suppress: @@ -950,7 +985,7 @@ The same as above, but with ``how='inner'``. .. ipython:: python - result = left.join(right, how='inner') + result = left.join(right, how="inner") .. ipython:: python :suppress: @@ -966,7 +1001,7 @@ indexes: .. 
ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='outer') + result = pd.merge(left, right, left_index=True, right_index=True, how="outer") .. ipython:: python :suppress: @@ -978,7 +1013,7 @@ indexes: .. ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='inner'); + result = pd.merge(left, right, left_index=True, right_index=True, how="inner") .. ipython:: python :suppress: @@ -1008,15 +1043,17 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key": ["K0", "K1", "K0", "K1"], + } + ) - right = pd.DataFrame({'C': ['C0', 'C1'], - 'D': ['D0', 'D1']}, - index=['K0', 'K1']) + right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"]) - result = left.join(right, on='key') + result = left.join(right, on="key") .. ipython:: python :suppress: @@ -1028,8 +1065,7 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - result = pd.merge(left, right, left_on='key', right_index=True, - how='left', sort=False); + result = pd.merge(left, right, left_on="key", right_index=True, how="left", sort=False) .. ipython:: python :suppress: @@ -1045,22 +1081,27 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: .. 
ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + } + ) - index = pd.MultiIndex.from_tuples([('K0', 'K0'), ('K1', 'K0'), - ('K2', 'K0'), ('K2', 'K1')]) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index) + index = pd.MultiIndex.from_tuples( + [("K0", "K0"), ("K1", "K0"), ("K2", "K0"), ("K2", "K1")] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index + ) Now this can be joined by passing the two key column names: .. ipython:: python - result = left.join(right, on=['key1', 'key2']) + result = left.join(right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -1079,7 +1120,7 @@ easily performed: .. ipython:: python - result = left.join(right, on=['key1', 'key2'], how='inner') + result = left.join(right, on=["key1", "key2"], how="inner") .. ipython:: python :suppress: @@ -1149,39 +1190,38 @@ the left argument, as in this example: .. 
ipython:: python - leftindex = pd.MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], - names=['abc', 'xy', 'num']) - left = pd.DataFrame({'v1': range(12)}, index=leftindex) + leftindex = pd.MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = pd.DataFrame({"v1": range(12)}, index=leftindex) left - rightindex = pd.MultiIndex.from_product([list('abc'), list('xy')], - names=['abc', 'xy']) - right = pd.DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) + rightindex = pd.MultiIndex.from_product([list("abc"), list("xy")], names=["abc", "xy"]) + right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) right - left.join(right, on=['abc', 'xy'], how='inner') + left.join(right, on=["abc", "xy"], how="inner") If that condition is not satisfied, a join with two multi-indexes can be done using the following code. .. ipython:: python - leftindex = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=leftindex) + leftindex = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) + left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex) - rightindex = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=rightindex) + rightindex = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=rightindex + ) - result = pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key', 'X', 'Y']) + result = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], 
how="inner" + ).set_index(["key", "X", "Y"]) .. ipython:: python :suppress: @@ -1203,21 +1243,29 @@ resetting indexes. .. ipython:: python - left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + left_index = pd.Index(["K0", "K0", "K1", "K2"], name="key1") - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key2': ['K0', 'K1', 'K0', 'K1']}, - index=left_index) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key2": ["K0", "K1", "K0", "K1"], + }, + index=left_index, + ) - right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + right_index = pd.Index(["K0", "K1", "K2", "K2"], name="key1") - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3'], - 'key2': ['K0', 'K0', 'K0', 'K1']}, - index=right_index) + right = pd.DataFrame( + { + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + "key2": ["K0", "K0", "K0", "K1"], + }, + index=right_index, + ) - result = left.merge(right, on=['key1', 'key2']) + result = left.merge(right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -1254,10 +1302,10 @@ columns: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'v': [1, 2, 3]}) - right = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'v': [4, 5, 6]}) + left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) + right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) - result = pd.merge(left, right, on='k') + result = pd.merge(left, right, on="k") .. ipython:: python :suppress: @@ -1269,7 +1317,7 @@ columns: .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) + result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) .. ipython:: python :suppress: @@ -1284,9 +1332,9 @@ similarly. .. 
ipython:: python - left = left.set_index('k') - right = right.set_index('k') - result = left.join(right, lsuffix='_l', rsuffix='_r') + left = left.set_index("k") + right = right.set_index("k") + result = left.join(right, lsuffix="_l", rsuffix="_r") .. ipython:: python :suppress: @@ -1306,7 +1354,7 @@ to join them together on their indexes. .. ipython:: python - right2 = pd.DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2']) + right2 = pd.DataFrame({"v": [7, 8, 9]}, index=["K1", "K1", "K2"]) result = left.join([right, right2]) .. ipython:: python @@ -1328,10 +1376,8 @@ one object from values for matching indices in the other. Here is an example: .. ipython:: python - df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan], - [np.nan, 7., np.nan]]) - df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], - index=[1, 2]) + df1 = pd.DataFrame([[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]) + df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) For this, use the :meth:`~DataFrame.combine_first` method: @@ -1384,14 +1430,13 @@ fill/interpolate missing data: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) + left = pd.DataFrame( + {"k": ["K0", "K1", "K1", "K2"], "lv": [1, 2, 3, 4], "s": ["a", "b", "c", "d"]} + ) - right = pd.DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) + right = pd.DataFrame({"k": ["K1", "K2", "K4"], "rv": [1, 2, 3]}) - pd.merge_ordered(left, right, fill_method='ffill', left_by='s') + pd.merge_ordered(left, right, fill_method="ffill", left_by="s") .. _merging.merge_asof: @@ -1411,37 +1456,44 @@ merge them. .. 
ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. 
ipython:: python @@ -1452,18 +1504,13 @@ By default we are taking the asof of the quotes. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") We only asof within ``2ms`` between the quote time and the trade time. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('2ms')) + pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")) We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time. Note that though we exclude the exact matches @@ -1471,11 +1518,14 @@ exclude exact matches on time. Note that though we exclude the exact matches .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) .. _merging.compare: @@ -1496,7 +1546,7 @@ side by side. { "col1": ["a", "a", "b", "b", "a"], "col2": [1.0, 2.0, 3.0, np.nan, 5.0], - "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + "col3": [1.0, 2.0, 3.0, 4.0, 5.0], }, columns=["col1", "col2", "col3"], ) @@ -1505,8 +1555,8 @@ side by side. .. ipython:: python df2 = df.copy() - df2.loc[0, 'col1'] = 'c' - df2.loc[2, 'col3'] = 4.0 + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 df2 .. ipython:: python