diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index c708ebb361ed1..46960140d3a8c 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. The class's @staticmethod def _validate(obj): # verify there is a column latitude and a column longitude - if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + if "latitude" not in obj.columns or "longitude" not in obj.columns: raise AttributeError("Must have 'latitude' and 'longitude'.") @property @@ -176,6 +176,7 @@ your ``MyExtensionArray`` class, as follows: from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin + class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): pass @@ -271,6 +272,7 @@ included as a column in a pandas DataFrame): def __arrow_array__(self, type=None): # convert the underlying array values to a pyarrow Array import pyarrow + return pyarrow.array(..., type=type) The ``ExtensionDtype.__from_arrow__`` method then controls the conversion @@ -347,7 +349,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame .. 
code-block:: python class SubclassedSeries(pd.Series): - @property def _constructor(self): return SubclassedSeries @@ -358,7 +359,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame class SubclassedDataFrame(pd.DataFrame): - @property def _constructor(self): return SubclassedDataFrame @@ -377,7 +377,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(to_framed) - >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -387,7 +387,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(df) - >>> sliced1 = df[['A', 'B']] + >>> sliced1 = df[["A", "B"]] >>> sliced1 A B 0 1 4 @@ -397,7 +397,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df["A"] >>> sliced2 0 1 1 2 @@ -422,11 +422,11 @@ Below is an example to define two original properties, "internal_cache" as a tem class SubclassedDataFrame2(pd.DataFrame): # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] _internal_names_set = set(_internal_names) # normal properties - _metadata = ['added_property'] + _metadata = ["added_property"] @property def _constructor(self): @@ -434,15 +434,15 @@ Below is an example to define two original properties, "internal_cache" as a tem .. 
code-block:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + >>> df.internal_cache = "cached" + >>> df.added_property = "property" >>> df.internal_cache cached @@ -450,11 +450,11 @@ Below is an example to define two original properties, "internal_cache" as a tem property # properties defined in _internal_names are reset after manipulation - >>> df[['A', 'B']].internal_cache + >>> df[["A", "B"]].internal_cache AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + >>> df[["A", "B"]].added_property property .. _extending.plotting-backends: @@ -468,7 +468,7 @@ one based on Matplotlib. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index e7edda90610b5..2f6ac6b06d85e 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -63,8 +63,7 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -73,9 +72,9 @@ in order to have a valid result. .. 
ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.cov() @@ -116,13 +115,12 @@ Wikipedia has articles covering the above correlation coefficients: .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan # Series with Series - frame['a'].corr(frame['b']) - frame['a'].corr(frame['b'], method='spearman') + frame["a"].corr(frame["b"]) + frame["a"].corr(frame["b"], method="spearman") # Pairwise correlation of DataFrame columns frame.corr() @@ -134,9 +132,9 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.corr() @@ -154,8 +152,8 @@ compute the correlation based on histogram intersection: # histogram intersection def histogram_intersection(a, b): - return np.minimum(np.true_divide(a, a.sum()), - np.true_divide(b, b.sum())).sum() + return np.minimum(np.true_divide(a, a.sum()), np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) @@ -165,8 +163,8 @@ DataFrame objects. .. 
ipython:: python - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) @@ -182,8 +180,8 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python - s = pd.Series(np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s = pd.Series(np.random.randn(5), index=list("abcde")) + s["d"] = s["b"] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -243,8 +241,7 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. ipython:: python - s = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) s = s.cumsum() s @@ -279,24 +276,26 @@ We can then call methods on these ``rolling`` objects. These return like-indexed .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig rolling_mean_ex.png - r.mean().plot(style='k') + r.mean().plot(style="k") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") They can also be applied to DataFrame objects. This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns: .. 
ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() @savefig rolling_mean_frame.png @@ -368,7 +367,7 @@ compute the mean absolute deviation on a rolling basis: return np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - s.rolling(window=60).apply(mad, raw=True).plot(style='k') + s.rolling(window=60).apply(mad, raw=True).plot(style="k") Using the Numba engine ~~~~~~~~~~~~~~~~~~~~~~ @@ -453,23 +452,22 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), - index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10)) - ser.rolling(window=5, win_type='triang').mean() + ser.rolling(window=5, win_type="triang").mean() Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`. .. ipython:: python - ser.rolling(window=5, win_type='boxcar').mean() + ser.rolling(window=5, win_type="boxcar").mean() ser.rolling(window=5).mean() For some windowing functions, additional parameters must be specified: .. ipython:: python - ser.rolling(window=5, win_type='gaussian').mean(std=0.1) + ser.rolling(window=5, win_type="gaussian").mean(std=0.1) .. _stats.moments.normalization: @@ -498,10 +496,10 @@ This can be particularly useful for a non-regular time frequency index. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, - freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. 
@@ -515,20 +513,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -537,7 +541,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -546,7 +550,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. _stats.custom_rolling_window: @@ -569,7 +573,7 @@ For example, if we have the following ``DataFrame``: use_expanding = [True, False, True, False, True] use_expanding - df = pd.DataFrame({'values': range(5)}) + df = pd.DataFrame({"values": range(5)}) df and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size @@ -615,7 +619,8 @@ rolling operations over a non-fixed offset like a ``BusinessDay``. .. 
ipython:: python from pandas.api.indexers import VariableOffsetWindowIndexer - df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10)) + + df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) offset = pd.offsets.BDay(1) indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) df @@ -657,17 +662,21 @@ from present information back to past information. This allows the rolling windo .. ipython:: python - df = pd.DataFrame({'x': 1}, - index=[pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) - - df["right"] = df.rolling('2s', closed='right').x.sum() # default - df["both"] = df.rolling('2s', closed='both').x.sum() - df["left"] = df.rolling('2s', closed='left').x.sum() - df["neither"] = df.rolling('2s', closed='neither').x.sum() + df = pd.DataFrame( + {"x": 1}, + index=[ + pd.Timestamp("20130101 09:00:01"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:04"), + pd.Timestamp("20130101 09:00:06"), + ], + ) + + df["right"] = df.rolling("2s", closed="right").x.sum() # default + df["both"] = df.rolling("2s", closed="both").x.sum() + df["left"] = df.rolling("2s", closed="left").x.sum() + df["neither"] = df.rolling("2s", closed="neither").x.sum() df @@ -745,13 +754,15 @@ For example: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() df2 = df[:20] - df2.rolling(window=5).corr(df2['B']) + df2.rolling(window=5).corr(df2["B"]) .. _stats.moments.corr_pairwise: @@ -776,14 +787,13 @@ can even be omitted: .. 
ipython:: python - covs = (df[['B', 'C', 'D']].rolling(window=50) - .cov(df[['A', 'B', 'C']], pairwise=True)) - covs.loc['2002-09-22':] + covs = df[["B", "C", "D"]].rolling(window=50).cov(df[["A", "B", "C"]], pairwise=True) + covs.loc["2002-09-22":] .. ipython:: python correls = df.rolling(window=50).corr() - correls.loc['2002-09-22':] + correls.loc["2002-09-22":] You can efficiently retrieve the time series of correlations between two columns by reshaping and indexing: @@ -791,12 +801,12 @@ columns by reshaping and indexing: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.unstack(1)[('A', 'C')].plot() + correls.unstack(1)[("A", "C")].plot() .. _stats.aggregate: @@ -810,9 +820,11 @@ perform multiple computations on the data. These operations are similar to the : .. ipython:: python - dfa = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + dfa = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) r = dfa.rolling(window=60, min_periods=1) r @@ -823,9 +835,9 @@ Series (or multiple Series) via standard ``__getitem__``. r.aggregate(np.sum) - r['A'].aggregate(np.sum) + r["A"].aggregate(np.sum) - r[['A', 'B']].aggregate(np.sum) + r[["A", "B"]].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. @@ -840,7 +852,7 @@ aggregation with, outputting a DataFrame: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a windowed DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -860,20 +872,20 @@ columns of a ``DataFrame``: .. 
ipython:: python - r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid, it must be implemented on the windowed object. .. ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore, you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) .. _stats.moments.expanding: @@ -967,7 +979,7 @@ all accept are: sn.expanding().sum() sn.cumsum() - sn.cumsum().fillna(method='ffill') + sn.cumsum().fillna(method="ffill") An expanding window statistic will be more stable (and less responsive) than @@ -978,14 +990,14 @@ relative impact of an individual data point. As an example, here is the .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig expanding_mean_frame.png - s.expanding().mean().plot(style='k') + s.expanding().mean().plot(style="k") .. _stats.moments.exponentially_weighted: @@ -1115,10 +1127,10 @@ of ``times``. .. ipython:: python - df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) df - times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] - df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"] + df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean() The following formula is used to compute exponentially weighted mean with an input vector of times: @@ -1130,10 +1142,10 @@ Here is an example for a univariate time series: .. 
ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig ewma_ex.png - s.ewm(span=20).mean().plot(style='k') + s.ewm(span=20).mean().plot(style="k") ExponentialMovingWindow has a ``min_periods`` argument, which has the same meaning it does for all the ``.expanding`` and ``.rolling`` methods: diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index c27c73d439a0c..d698b316d321e 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -51,7 +51,7 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s s.index @@ -71,7 +71,7 @@ Series can be instantiated from dicts: .. ipython:: python - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} pd.Series(d) .. note:: @@ -92,9 +92,9 @@ index will be pulled out. .. ipython:: python - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} pd.Series(d) - pd.Series(d, index=['b', 'c', 'd', 'a']) + pd.Series(d, index=["b", "c", "d", "a"]) .. note:: @@ -107,7 +107,7 @@ provided. The value will be repeated to match the length of **index**. .. ipython:: python - pd.Series(5., index=['a', 'b', 'c', 'd', 'e']) + pd.Series(5.0, index=["a", "b", "c", "d", "e"]) Series is ndarray-like ~~~~~~~~~~~~~~~~~~~~~~ @@ -173,26 +173,26 @@ label: .. ipython:: python - s['a'] - s['e'] = 12. + s["a"] + s["e"] = 12.0 s - 'e' in s - 'f' in s + "e" in s + "f" in s If a label is not contained, an exception is raised: .. code-block:: python - >>> s['f'] + >>> s["f"] KeyError: 'f' Using the ``get`` method, a missing label will return None or specified default: .. ipython:: python - s.get('f') + s.get("f") - s.get('f', np.nan) + s.get("f", np.nan) See also the :ref:`section on attribute access`. @@ -244,7 +244,7 @@ Series can also have a ``name`` attribute: .. 
ipython:: python - s = pd.Series(np.random.randn(5), name='something') + s = pd.Series(np.random.randn(5), name="something") s s.name @@ -306,13 +306,15 @@ keys. .. ipython:: python - d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), - 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + d = { + "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), + "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]), + } df = pd.DataFrame(d) df - pd.DataFrame(d, index=['d', 'b', 'a']) - pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + pd.DataFrame(d, index=["d", "b", "a"]) + pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"]) The row and column labels can be accessed respectively by accessing the **index** and **columns** attributes: @@ -336,10 +338,9 @@ result will be ``range(n)``, where ``n`` is the array length. .. ipython:: python - d = {'one': [1., 2., 3., 4.], - 'two': [4., 3., 2., 1.]} + d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} pd.DataFrame(d) - pd.DataFrame(d, index=['a', 'b', 'c', 'd']) + pd.DataFrame(d, index=["a", "b", "c", "d"]) From structured or record array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -348,12 +349,12 @@ This case is handled identically to a dict of arrays. .. ipython:: python - data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) - data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) - pd.DataFrame(data, index=['first', 'second']) - pd.DataFrame(data, columns=['C', 'A', 'B']) + pd.DataFrame(data, index=["first", "second"]) + pd.DataFrame(data, columns=["C", "A", "B"]) .. note:: @@ -367,10 +368,10 @@ From a list of dicts .. 
ipython:: python - data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}] pd.DataFrame(data2) - pd.DataFrame(data2, index=['first', 'second']) - pd.DataFrame(data2, columns=['a', 'b']) + pd.DataFrame(data2, index=["first", "second"]) + pd.DataFrame(data2, columns=["a", "b"]) .. _basics.dataframe.from_dict_of_tuples: @@ -382,11 +383,15 @@ dictionary. .. ipython:: python - pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, - ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, - ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, - ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, - ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + pd.DataFrame( + { + ("a", "b"): {("A", "B"): 1, ("A", "C"): 2}, + ("a", "a"): {("A", "C"): 3, ("A", "B"): 4}, + ("a", "c"): {("A", "B"): 5, ("A", "C"): 6}, + ("b", "a"): {("A", "C"): 7, ("A", "B"): 8}, + ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}, + } + ) .. _basics.dataframe.from_series: @@ -414,11 +419,11 @@ first ``namedtuple``, a ``ValueError`` is raised. from collections import namedtuple - Point = namedtuple('Point', 'x y') + Point = namedtuple("Point", "x y") pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) - Point3D = namedtuple('Point3D', 'x y z') + Point3D = namedtuple("Point3D", "x y z") pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) @@ -468,15 +473,18 @@ set to ``'index'`` in order to use the dict keys as row labels. .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])) + pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])) If you pass ``orient='index'``, the keys will be the row labels. In this case, you can also pass the desired column names: .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), - orient='index', columns=['one', 'two', 'three']) + pd.DataFrame.from_dict( + dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]), + orient="index", + columns=["one", "two", "three"], + ) .. 
_basics.dataframe.from_records: @@ -490,7 +498,7 @@ dtype. For example: .. ipython:: python data - pd.DataFrame.from_records(data, index='C') + pd.DataFrame.from_records(data, index="C") .. _basics.dataframe.sel_add_del: @@ -503,17 +511,17 @@ the analogous dict operations: .. ipython:: python - df['one'] - df['three'] = df['one'] * df['two'] - df['flag'] = df['one'] > 2 + df["one"] + df["three"] = df["one"] * df["two"] + df["flag"] = df["one"] > 2 df Columns can be deleted or popped like with a dict: .. ipython:: python - del df['two'] - three = df.pop('three') + del df["two"] + three = df.pop("three") df When inserting a scalar value, it will naturally be propagated to fill the @@ -521,7 +529,7 @@ column: .. ipython:: python - df['foo'] = 'bar' + df["foo"] = "bar" df When inserting a Series that does not have the same index as the DataFrame, it @@ -529,7 +537,7 @@ will be conformed to the DataFrame's index: .. ipython:: python - df['one_trunc'] = df['one'][:2] + df["one_trunc"] = df["one"][:2] df You can insert raw ndarrays but their length must match the length of the @@ -540,7 +548,7 @@ available to insert at a particular location in the columns: .. ipython:: python - df.insert(1, 'bar', df['one']) + df.insert(1, "bar", df["one"]) df .. _dsintro.chained_assignment: @@ -556,17 +564,16 @@ derived from existing columns. .. ipython:: python - iris = pd.read_csv('data/iris.data') + iris = pd.read_csv("data/iris.data") iris.head() - (iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']) - .head()) + iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head() In the example above, we inserted a precomputed value. We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to. .. 
ipython:: python - iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head() + iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head() ``assign`` **always** returns a copy of the data, leaving the original DataFrame untouched. @@ -580,10 +587,14 @@ greater than 5, calculate the ratio, and plot: .. ipython:: python @savefig basics_assign.png - (iris.query('SepalLength > 5') - .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength, - PetalRatio=lambda x: x.PetalWidth / x.PetalLength) - .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + ( + iris.query("SepalLength > 5") + .assign( + SepalRatio=lambda x: x.SepalWidth / x.SepalLength, + PetalRatio=lambda x: x.PetalWidth / x.PetalLength, + ) + .plot(kind="scatter", x="SepalRatio", y="PetalRatio") + ) Since a function is passed in, the function is computed on the DataFrame being assigned to. Importantly, this is the DataFrame that's been filtered @@ -603,10 +614,8 @@ to a column created earlier in the same :meth:`~DataFrame.assign`. .. ipython:: python - dfa = pd.DataFrame({"A": [1, 2, 3], - "B": [4, 5, 6]}) - dfa.assign(C=lambda x: x['A'] + x['B'], - D=lambda x: x['A'] + x['C']) + dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"]) In the second expression, ``x['C']`` will refer to the newly created column, that's equal to ``dfa['A'] + dfa['B']``. @@ -631,7 +640,7 @@ DataFrame: .. ipython:: python - df.loc['b'] + df.loc["b"] df.iloc[2] For a more exhaustive treatment of sophisticated label-based indexing and @@ -650,8 +659,8 @@ union of the column and row labels. .. 
ipython:: python - df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C']) + df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"]) df + df2 When doing an operation between DataFrame and Series, the default behavior is @@ -680,8 +689,8 @@ Boolean operators work as well: .. ipython:: python - df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool) - df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool) + df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool) + df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool) df1 & df2 df1 | df2 df1 ^ df2 @@ -737,8 +746,8 @@ on two :class:`Series` with differently ordered labels will align before the ope .. ipython:: python - ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c']) + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"]) ser1 ser2 np.remainder(ser1, ser2) @@ -748,7 +757,7 @@ with missing values. .. ipython:: python - ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd']) + ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"]) ser3 np.remainder(ser1, ser3) @@ -778,11 +787,11 @@ R package): :suppress: # force a summary to be printed - pd.set_option('display.max_rows', 5) + pd.set_option("display.max_rows", 5) .. ipython:: python - baseball = pd.read_csv('data/baseball.csv') + baseball = pd.read_csv("data/baseball.csv") print(baseball) baseball.info() @@ -791,7 +800,7 @@ R package): :okwarning: # restore GlobalPrintConfig - pd.reset_option(r'^display\.') + pd.reset_option(r"^display\.") However, using ``to_string`` will return a string representation of the DataFrame in tabular form, though it won't always fit the console width: @@ -812,7 +821,7 @@ option: .. 
ipython:: python - pd.set_option('display.width', 40) # default is 80 + pd.set_option("display.width", 40) # default is 80 pd.DataFrame(np.random.randn(3, 12)) @@ -820,21 +829,25 @@ You can adjust the max width of the individual columns by setting ``display.max_ .. ipython:: python - datafile = {'filename': ['filename_01', 'filename_02'], - 'path': ["media/user_name/storage/folder_01/filename_01", - "media/user_name/storage/folder_02/filename_02"]} + datafile = { + "filename": ["filename_01", "filename_02"], + "path": [ + "media/user_name/storage/folder_01/filename_01", + "media/user_name/storage/folder_02/filename_02", + ], + } - pd.set_option('display.max_colwidth', 30) + pd.set_option("display.max_colwidth", 30) pd.DataFrame(datafile) - pd.set_option('display.max_colwidth', 100) + pd.set_option("display.max_colwidth", 100) pd.DataFrame(datafile) .. ipython:: python :suppress: - pd.reset_option('display.width') - pd.reset_option('display.max_colwidth') + pd.reset_option("display.width") + pd.reset_option("display.max_colwidth") You can also disable this feature via the ``expand_frame_repr`` option. This will print the table in one block. @@ -847,8 +860,7 @@ accessed like an attribute: .. ipython:: python - df = pd.DataFrame({'foo1': np.random.randn(5), - 'foo2': np.random.randn(5)}) + df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)}) df df.foo1 diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index f41912445455d..46ab29a52747a 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -11,7 +11,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") We provide the basics in pandas to easily create decent looking plots. 
See the :ref:`ecosystem <ecosystem.visualization>` section for visualization @@ -39,8 +40,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -54,18 +54,17 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() plt.figure(); @savefig frame_plot_basic.png - df.plot(); + df.plot() You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: @@ -73,17 +72,17 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = pd.Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=["B", "C"]).cumsum() + df3["A"] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png - df3.plot(x='A', y='B') + df3.plot(x="A", y="B") .. note:: @@ -93,7 +92,7 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. 
_visualization.other: @@ -120,7 +119,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind='bar'); + df.iloc[5].plot(kind="bar") You can also create these other plots using the methods ``DataFrame.plot.`` instead of providing the ``kind`` keyword argument. This makes it easier to discover plot methods and the specific arguments they use: @@ -164,7 +163,7 @@ For labeled, non-time series data, you may wish to produce a bar plot: @savefig bar_plot_ex.png df.iloc[5].plot.bar() - plt.axhline(0, color='k'); + plt.axhline(0, color="k") Calling a DataFrame's :meth:`plot.bar() ` method produces a multiple bar plot: @@ -172,42 +171,42 @@ bar plot: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig bar_plot_multi_ex.png - df2.plot.bar(); + df2.plot.bar() To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig bar_plot_stacked_ex.png - df2.plot.bar(stacked=True); + df2.plot.bar(stacked=True) To get horizontal bar plots, use the ``barh`` method: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig barh_plot_stacked_ex.png - df2.plot.barh(stacked=True); + df2.plot.barh(stacked=True) .. _visualization.hist: @@ -218,8 +217,14 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. 
ipython:: python - df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), - 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame( + { + "a": np.random.randn(1000) + 1, + "b": np.random.randn(1000), + "c": np.random.randn(1000) - 1, + }, + columns=["a", "b", "c"], + ) plt.figure(); @@ -230,7 +235,7 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. ipython:: python :suppress: - plt.close('all') + plt.close("all") A histogram can be stacked using ``stacked=True``. Bin size can be changed using the ``bins`` keyword. @@ -245,7 +250,7 @@ using the ``bins`` keyword. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histograms can be drawn by @@ -256,12 +261,12 @@ horizontal and cumulative histograms can be drawn by plt.figure(); @savefig hist_new_kwargs.png - df4['a'].plot.hist(orientation='horizontal', cumulative=True) + df4["a"].plot.hist(orientation="horizontal", cumulative=True) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hist ` method and the `matplotlib hist documentation `__ for more. @@ -274,12 +279,12 @@ The existing interface ``DataFrame.hist`` to plot histogram still can be used. plt.figure(); @savefig hist_plot_ex.png - df['A'].diff().hist() + df["A"].diff().hist() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") :meth:`DataFrame.hist` plots the histograms of the columns on multiple subplots: @@ -289,7 +294,7 @@ subplots: plt.figure() @savefig frame_hist_ex.png - df.diff().hist(color='k', alpha=0.5, bins=50) + df.diff().hist(color="k", alpha=0.5, bins=50) The ``by`` keyword can be specified to plot grouped histograms: @@ -297,7 +302,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. 
ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) @@ -323,12 +328,12 @@ a uniform random variable on [0,1). .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) @savefig box_plot_new.png df.plot.box() @@ -348,16 +353,20 @@ more complicated colorization, you can get each drawn artists by passing .. ipython:: python - color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', - 'medians': 'DarkBlue', 'caps': 'Gray'} + color = { + "boxes": "DarkGreen", + "whiskers": "DarkOrange", + "medians": "DarkBlue", + "caps": "Gray", + } @savefig box_new_colorize.png - df.plot.box(color=color, sym='r+') + df.plot.box(color=color, sym="r+") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Also, you can pass other keywords supported by matplotlib ``boxplot``. For example, horizontal and custom-positioned boxplot can be drawn by @@ -378,7 +387,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python @@ -396,19 +405,19 @@ groupings. For instance, .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. 
ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df = pd.DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) - plt.figure(); + plt.figure() @savefig box_plot_ex2.png - bp = df.boxplot(by='X') + bp = df.boxplot(by="X") You can also pass a subset of columns to plot, as well as group by multiple columns: @@ -416,25 +425,25 @@ columns: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']) + df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) plt.figure(); @savefig box_plot_ex3.png - bp = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + bp = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.box.return: @@ -462,16 +471,16 @@ keyword, will affect the output type as well: np.random.seed(1234) df_box = pd.DataFrame(np.random.randn(50, 2)) - df_box['g'] = np.random.choice(['A', 'B'], size=50) - df_box.loc[df_box['g'] == 'B', 1] += 3 + df_box["g"] = np.random.choice(["A", "B"], size=50) + df_box.loc[df_box["g"] == "B", 1] += 3 @savefig boxplot_groupby.png - bp = df_box.boxplot(by='g') + bp = df_box.boxplot(by="g") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The subplots above are split by the numeric columns first, then the value of the ``g`` column. 
Below the subplots are first split by the value of ``g``, @@ -481,12 +490,12 @@ then by the numeric columns. :okwarning: @savefig groupby_boxplot_vis.png - bp = df_box.groupby('g').boxplot() + bp = df_box.groupby("g").boxplot() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.area_plot: @@ -506,23 +515,23 @@ When input data contains ``NaN``, it will be automatically filled by 0. If you w .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig area_plot_stacked.png - df.plot.area(); + df.plot.area() To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 unless otherwise specified: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig area_plot_unstacked.png - df.plot.area(stacked=False); + df.plot.area(stacked=False) .. _visualization.scatter: @@ -537,29 +546,29 @@ These can be specified by the ``x`` and ``y`` keywords. :suppress: np.random.seed(123456) - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python - df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"]) @savefig scatter_plot.png - df.plot.scatter(x='a', y='b'); + df.plot.scatter(x="a", y="b") To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. It is recommended to specify ``color`` and ``label`` keywords to distinguish each group. .. ipython:: python - ax = df.plot.scatter(x='a', y='b', color='DarkBlue', label='Group 1'); + ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png - df.plot.scatter(x='c', y='d', color='DarkGreen', label='Group 2', ax=ax); + df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax) ..
ipython:: python :suppress: - plt.close('all') + plt.close("all") The keyword ``c`` may be given as the name of a column to provide colors for each point: @@ -567,13 +576,13 @@ each point: .. ipython:: python @savefig scatter_plot_colored.png - df.plot.scatter(x='a', y='b', c='c', s=50); + df.plot.scatter(x="a", y="b", c="c", s=50) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib :meth:`scatter `. The example below shows a @@ -582,12 +591,12 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x='a', y='b', s=df['c'] * 200); + df.plot.scatter(x="a", y="b", s=df["c"] * 200) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`scatter ` method and the `matplotlib scatter documentation `__ for more. @@ -609,11 +618,11 @@ too dense to plot each point individually. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] + np.arange(1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] + np.arange(1000) @savefig hexbin_plot.png - df.plot.hexbin(x='a', y='b', gridsize=25) + df.plot.hexbin(x="a", y="b", gridsize=25) A useful keyword argument is ``gridsize``; it controls the number of hexagons @@ -631,23 +640,23 @@ given by column ``z``. The bins are aggregated with NumPy's ``max`` function. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. 
ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] = df['b'] + np.arange(1000) - df['z'] = np.random.uniform(0, 3, 1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] = df["b"] + np.arange(1000) + df["z"] = np.random.uniform(0, 3, 1000) @savefig hexbin_plot_agg.png - df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) + df.plot.hexbin(x="a", y="b", C="z", reduce_C_function=np.max, gridsize=25) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hexbin ` method and the `matplotlib hexbin documentation `__ for more. @@ -670,8 +679,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :okwarning: - series = pd.Series(3 * np.random.rand(4), - index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=["a", "b", "c", "d"], name="series") @savefig series_pie_plot.png series.plot.pie(figsize=(6, 6)) @@ -679,7 +687,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") For pie plots it's best to use square figures, i.e. a figure aspect ratio 1. You can create the figure with equal width and height, or force the aspect ratio @@ -700,8 +708,9 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python - df = pd.DataFrame(3 * np.random.rand(4, 2), - index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame( + 3 * np.random.rand(4, 2), index=["a", "b", "c", "d"], columns=["x", "y"] + ) @savefig df_pie_plot.png df.plot.pie(subplots=True, figsize=(8, 4)) @@ -709,7 +718,7 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. 
@@ -731,21 +740,26 @@ Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. .. ipython:: python @savefig series_pie_plot_options.png - series.plot.pie(labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], - autopct='%.2f', fontsize=20, figsize=(6, 6)) + series.plot.pie( + labels=["AA", "BB", "CC", "DD"], + colors=["r", "g", "b", "c"], + autopct="%.2f", + fontsize=20, + figsize=(6, 6), + ) If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python :okwarning: - series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=["a", "b", "c", "d"], name="series2") @savefig series_pie_plot_semi.png series.plot.pie(figsize=(6, 6)) @@ -755,7 +769,7 @@ See the `matplotlib pie documentation `__ for more. @@ -1560,12 +1574,12 @@ To use the cubehelix colormap, we can pass ``colormap='cubehelix'``. plt.figure() @savefig cubehelix.png - df.plot(colormap='cubehelix') + df.plot(colormap="cubehelix") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Alternatively, we can pass the colormap itself: @@ -1581,7 +1595,7 @@ Alternatively, we can pass the colormap itself: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Colormaps can also be used with other plot types, like bar charts: @@ -1598,12 +1612,12 @@ Colormaps can also be used with other plot types, like bar charts: plt.figure() @savefig greens.png - dd.plot.bar(colormap='Greens') + dd.plot.bar(colormap="Greens") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Parallel coordinates charts: @@ -1612,12 +1626,12 @@ Parallel coordinates charts: plt.figure() @savefig parallel_gist_rainbow.png - parallel_coordinates(data, 'Name', colormap='gist_rainbow') + parallel_coordinates(data, "Name", colormap="gist_rainbow") ..
ipython:: python :suppress: - plt.close('all') + plt.close("all") Andrews curves charts: @@ -1626,12 +1640,12 @@ Andrews curves charts: plt.figure() @savefig andrews_curve_winter.png - andrews_curves(data, 'Name', colormap='winter') + andrews_curves(data, "Name", colormap="winter") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting directly with matplotlib --------------------------------- @@ -1655,23 +1669,24 @@ when plotting a large number of points. .. ipython:: python - price = pd.Series(np.random.randn(150).cumsum(), - index=pd.date_range('2000-1-1', periods=150, freq='B')) + price = pd.Series( + np.random.randn(150).cumsum(), + index=pd.date_range("2000-1-1", periods=150, freq="B"), + ) ma = price.rolling(20).mean() mstd = price.rolling(20).std() plt.figure() - plt.plot(price.index, price, 'k') - plt.plot(ma.index, ma, 'b') + plt.plot(price.index, price, "k") + plt.plot(ma.index, ma, "b") @savefig bollinger.png - plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, - color='b', alpha=0.2) + plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, color="b", alpha=0.2) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting backends ----------------- @@ -1685,21 +1700,21 @@ function. For example: .. code-block:: python - >>> Series([1, 2, 3]).plot(backend='backend.module') + >>> Series([1, 2, 3]).plot(backend="backend.module") Alternatively, you can also set this option globally, so you don't need to specify the keyword in each ``plot`` call. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() Or: .. code-block:: python - >>> pd.options.plotting.backend = 'backend.module' + >>> pd.options.plotting.backend = "backend.module" >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: