From 0f76a2a3e39c057dc2cd1d06fa468aaeab3e7198 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 08:15:54 -0500 Subject: [PATCH 01/10] DOC: Fix code block line length --- doc/source/user_guide/categorical.rst | 20 ++++++-- doc/source/user_guide/io.rst | 68 ++++++++++++++++++++++----- doc/source/user_guide/reshaping.rst | 38 +++++++++++++-- doc/source/user_guide/text.rst | 23 +++++++-- doc/source/user_guide/timedeltas.rst | 5 +- setup.cfg | 2 +- 6 files changed, 130 insertions(+), 26 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6a8e1767ef7e8..06dd7438da3e2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -513,7 +513,11 @@ The ordering of the categorical is determined by the ``categories`` of that colu dfs = pd.DataFrame( { - "A": pd.Categorical(list("bbeebbaa"), categories=["e", "a", "b"], ordered=True), + "A": pd.Categorical( + list("bbeebbaa"), + categories=["e", "a", "b"], + ordered=True, + ), "B": [1, 2, 1, 2, 2, 1, 2, 1], } ) @@ -642,7 +646,13 @@ Groupby will also show "unused" categories: df.groupby("cats").mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df2 = pd.DataFrame({"cats": cats2, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) + df2 = pd.DataFrame( + { + "cats": cats2, + "B": ["c", "d", "c", "d"], + "values": [1, 2, 3, 4], + } + ) df2.groupby(["cats", "B"]).mean() @@ -1115,7 +1125,11 @@ You can use ``fillna`` to handle missing values before applying a function. .. ipython:: python df = pd.DataFrame( - {"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], "cats": pd.Categorical([1, 2, 3, 2])} + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "cats": pd.Categorical([1, 2, 3, 2]) + } ) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e483cebf71614..8c7a5801606b1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -986,7 +986,12 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With .. ipython:: python # Try to infer the format for the index column - df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", + index_col=0, + parse_dates=True, + infer_datetime_format=True, + ) df .. ipython:: python @@ -1046,9 +1051,19 @@ writing to a file). For example: val = "0.3066101993807095471566981359501369297504425048828125" data = "a,b,c\n1,2,{0}".format(val) - abs(pd.read_csv(StringIO(data), engine="c", float_precision=None)["c"][0] - float(val)) abs( - pd.read_csv(StringIO(data), engine="c", float_precision="high")["c"][0] - float(val) + pd.read_csv( + StringIO(data), + engine="c", + float_precision=None + )["c"][0] - float(val) + ) + abs( + pd.read_csv( + StringIO(data), + engine="c", + float_precision="high" + )["c"][0] - float(val) ) abs( pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0] @@ -2517,7 +2532,12 @@ columns to strings. .. code-block:: python url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" - dfs = pd.read_html(url_mcc, match="Telekom Albania", header=0, converters={"MNC": str}) + dfs = pd.read_html( + url_mcc, + match="Telekom Albania", + header=0, + converters={"MNC": str}, + ) Use some combination of the above: @@ -3570,7 +3590,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. 
ipython:: python - df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + { + "col1": [0, np.nan, 2], + "col2": [1, np.nan, np.nan] + } + ) df_with_missing df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") @@ -3944,7 +3969,8 @@ specified in the format: ``()``, where float may be signed (and fra { "A": pd.Timestamp("20130101"), "B": [ - pd.Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) + pd.Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) ], } ) @@ -4241,7 +4267,11 @@ results. store.select("df2_mt") # as a multiple - store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") + store.select_as_multiple( + ["df1_mt", "df2_mt"], + where=["A>0", "B>0"], + selector="df1_mt", + ) Delete from a table @@ -4797,8 +4827,16 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet("example_fp.parquet", engine="fastparquet", columns=["a", "b"]) - result = pd.read_parquet("example_pa.parquet", engine="pyarrow", columns=["a", "b"]) + result = pd.read_parquet( + "example_fp.parquet", + engine="fastparquet", + columns=["a", "b"], + ) + result = pd.read_parquet( + "example_pa.parquet", + engine="pyarrow", + columns=["a", "b"], + ) result.dtypes @@ -5176,7 +5214,11 @@ to pass to :func:`pandas.to_datetime`: .. code-block:: python pd.read_sql_table("data", engine, parse_dates={"Date": "%Y-%m-%d"}) - pd.read_sql_table("data", engine, parse_dates={"Date": {"format": "%Y-%m-%d %H:%M:%S"}}) + pd.read_sql_table( + "data", + engine, + parse_dates={"Date": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) You can check if a table exists using :func:`~pandas.io.sql.has_table` @@ -5593,7 +5635,11 @@ avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss("spss_data.sav", usecols=["foo", "bar"], convert_categoricals=False) + df = pd.read_spss( + "spss_data.sav", + usecols=["foo", "bar"], + convert_categoricals=False, + ) More information about the SAV and ZSAV file formats is available here_. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 2061185b25416..315f19723de63 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -238,7 +238,13 @@ calling ``sort_index``, of course). Here is a more complex example: .. ipython:: python columns = pd.MultiIndex.from_tuples( - [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")], names=["exp", "animal"] + [ + ("A", "cat"), + ("B", "dog"), + ("B", "cat"), + ("A", "dog"), + ], + names=["exp", "animal"], ) index = pd.MultiIndex.from_product( [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"] @@ -800,14 +806,26 @@ parameter. .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean", fill_value=0) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc="mean", + fill_value=0, + ) Also note that we can pass in other aggregation functions as well. For example, we can also pass in ``sum``. .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc="sum", fill_value=0) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc="sum", + fill_value=0, + ) Another aggregation we can do is calculate the frequency in which the columns and rows occur together a.k.a. "cross tabulation". 
To do this, we can pass @@ -825,7 +843,12 @@ We can also perform multiple aggregations. For example, to perform both a .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc=["mean", "sum"]) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc=["mean", "sum"], + ) Note to aggregate over multiple value columns, we can pass in a list to the ``values`` parameter. @@ -839,7 +862,12 @@ Note to subdivide over multiple columns we can pass in a list to the .. ipython:: python - df.pivot_table(values=["val0"], index="row", columns=["item", "col"], aggfunc=["mean"]) + df.pivot_table( + values=["val0"], + index="row", + columns=["item", "col"], + aggfunc=["mean"], + ) .. _reshaping.explode: diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2ada09117273d..1dd567cabd1e2 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -261,7 +261,8 @@ i.e., from the end of the string to the beginning of the string: .. ipython:: python s3 = pd.Series( - ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype="string" + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype="string", ) s3 s3.str.replace("^.a|dog", "XX-XX ", case=False) @@ -515,7 +516,10 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"([ab])(\d)", expand=False) + pd.Series( + ["a1", "b2", "c3"], + dtype="string", + ).str.extract(r"([ab])(\d)", expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -536,7 +540,10 @@ and optional groups like .. ipython:: python - pd.Series(["a1", "b2", "3"], dtype="string").str.extract(r"([ab])?(\d)", expand=False) + pd.Series( + ["a1", "b2", "3"], + dtype="string", + ).str.extract(r"([ab])?(\d)", expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -661,13 +668,19 @@ Or whether elements match a pattern: .. ipython:: python - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.match(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.match(pattern) .. versionadded:: 1.1.0 .. ipython:: python - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.fullmatch(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.fullmatch(pattern) .. note:: diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 971a415088220..91e694626fe5a 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -409,7 +409,10 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI .. 
ipython:: python - s = pd.Series(np.arange(100), index=pd.timedelta_range("1 days", periods=100, freq="h")) + s = pd.Series( + np.arange(100), + index=pd.timedelta_range("1 days", periods=100, freq="h"), + ) s Selections work similarly, with coercion on string-likes and slices: diff --git a/setup.cfg b/setup.cfg index 73986f692b6cd..5a3ad3f0f95b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ exclude = env # exclude asv benchmark environments from linting [flake8-rst] -max-line-length = 88 +max-line-length = 84 bootstrap = import numpy as np import pandas as pd From 94ba01018cbe645c54fc39ccb52cb70dc7280d5b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 08:24:20 -0500 Subject: [PATCH 02/10] Fix --- doc/source/user_guide/categorical.rst | 2 +- doc/source/user_guide/io.rst | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 06dd7438da3e2..ad7a5a994f1f6 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1128,7 +1128,7 @@ You can use ``fillna`` to handle missing values before applying a function. { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], - "cats": pd.Categorical([1, 2, 3, 2]) + "cats": pd.Categorical([1, 2, 3, 2]), } ) df.apply(lambda row: type(row["cats"]), axis=1) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8c7a5801606b1..8458f2e91610e 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1055,14 +1055,14 @@ writing to a file). For example: pd.read_csv( StringIO(data), engine="c", - float_precision=None + float_precision=None, )["c"][0] - float(val) ) abs( pd.read_csv( StringIO(data), engine="c", - float_precision="high" + float_precision="high", )["c"][0] - float(val) ) abs( @@ -3593,7 +3593,7 @@ HDFStore will by default not drop rows that are all missing. This behavior can b df_with_missing = pd.DataFrame( { "col1": [0, np.nan, 2], - "col2": [1, np.nan, np.nan] + "col2": [1, np.nan, np.nan], } ) df_with_missing From 5b5268e69e1592deca9f88f6ab75f0f5966ca4e2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 09:10:08 -0500 Subject: [PATCH 03/10] Fix --- doc/source/user_guide/reshaping.rst | 7 ++++++- doc/source/user_guide/text.rst | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 315f19723de63..2a346fe854ff1 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -855,7 +855,12 @@ Note to aggregate over multiple value columns, we can pass in a list to the .. ipython:: python - df.pivot_table(values=["val0", "val1"], index="row", columns="col", aggfunc=["mean"]) + df.pivot_table( + values=["val0", "val1"], + index="row", + columns="col", + aggfunc=["mean"], + ) Note to subdivide over multiple columns we can pass in a list to the ``columns`` parameter. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 1dd567cabd1e2..2b27d37904599 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -662,7 +662,10 @@ You can check whether elements contain a pattern: .. 
ipython:: python pattern = r"[0-9][a-z]" - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.contains(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.contains(pattern) Or whether elements match a pattern: From f089659d6151c29b10a6bebd207ff7b8c1f827f0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 09:32:36 -0500 Subject: [PATCH 04/10] Another --- doc/source/user_guide/reshaping.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 2a346fe854ff1..77cf43b2e2b19 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -432,7 +432,12 @@ We can produce pivot tables from this data very easily: pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum) - pd.pivot_table(df, values=["D", "E"], index=["B"], columns=["A", "C"], aggfunc=np.sum) + pd.pivot_table( + df, values=["D", "E"], + index=["B"], + columns=["A", "C"], + aggfunc=np.sum, + ) The result object is a ``DataFrame`` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table From f57fa6c782a3e6183e43bfb30b787eeab251b372 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 2 Oct 2020 13:51:05 -0500 Subject: [PATCH 05/10] Fix --- .../intro_tutorials/06_calculate_statistics.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index 7e919777fdf03..6ce98ba5dbd1b 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -123,7 +123,10 @@ aggregating statistics for given columns can be defined using the .. ipython:: python titanic.agg( - {"Age": ["min", "max", "median", "skew"], "Fare": ["min", "max", "median", "mean"]} + { + "Age": ["min", "max", "median", "skew"], + "Fare": ["min", "max", "median", "mean"], + } ) .. raw:: html From 31032a5279251c3df1fd12cbbacfd169168581fa Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 2 Oct 2020 14:47:13 -0500 Subject: [PATCH 06/10] Fix --- doc/source/user_guide/cookbook.rst | 41 +++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 0a30d865f3c23..fe17007a70b26 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -266,7 +266,9 @@ New columns .. ipython:: python - df = pd.DataFrame({"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]}) + df = pd.DataFrame( + {"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]} + ) df Method 1 : idxmin() to get the index of the minimums @@ -327,7 +329,9 @@ Arithmetic .. ipython:: python - cols = pd.MultiIndex.from_tuples([(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]]) + cols = pd.MultiIndex.from_tuples( + [(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]] + ) df = pd.DataFrame(np.random.randn(2, 6), index=["n", "m"], columns=cols) df df = df.div(df["C"], level=1) @@ -566,7 +570,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - df = pd.DataFrame({"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]}) + df = pd.DataFrame( + {"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]} + ) df df["Counts"] = df.groupby(["Color"]).transform(len) df @@ -648,7 +654,10 @@ Create a list of dataframes, split using a delineation based on logic included i dfs = list( zip( *df.groupby( - (1 * (df["Case"] == "B")).cumsum().rolling(window=3, min_periods=1).median() + (1 * (df["Case"] == "B")) + .cumsum() + .rolling(window=3, min_periods=1) + .median() ) ) )[-1] @@ -740,7 +749,18 @@ The :ref:`Pivot ` docs. "yes", ], "Passed": ["yes" if x > 50 else "no" for x in grades], - "Employed": [True, True, True, False, False, False, False, True, True, False], + "Employed": [ + True, + True, + True, + False, + False, + False, + False, + True, + True, + False, + ], "Grade": grades, } ) @@ -791,7 +811,9 @@ Apply return pd.Series(aList) - df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series @@ -1162,7 +1184,12 @@ Option 1: pass rows explicitly to skip rows from io import StringIO pd.read_csv( - StringIO(data), sep=";", skiprows=[11, 12], index_col=0, parse_dates=True, header=10 + StringIO(data), + sep=";", + skiprows=[11, 12], + index_col=0, + parse_dates=True, + header=10, ) Option 2: read column names and then data From ce985ba41d6d13b2108771eeca7fddf0313b9782 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 2 Oct 2020 18:09:12 -0500 Subject: [PATCH 07/10] Fix --- doc/source/user_guide/merging.rst | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 8dbfc261e6fa8..da16aaf5b3a56 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -1065,7 +1065,9 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - result = pd.merge(left, right, left_on="key", right_index=True, how="left", sort=False) + result = pd.merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) .. ipython:: python :suppress: @@ -1196,7 +1198,9 @@ the left argument, as in this example: left = pd.DataFrame({"v1": range(12)}, index=leftindex) left - rightindex = pd.MultiIndex.from_product([list("abc"), list("xy")], names=["abc", "xy"]) + rightindex = pd.MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) right @@ -1210,7 +1214,9 @@ done using the following code. leftindex = pd.MultiIndex.from_tuples( [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] ) - left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex + ) rightindex = pd.MultiIndex.from_tuples( [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] @@ -1376,7 +1382,9 @@ one object from values for matching indices in the other. Here is an example: .. 
ipython:: python - df1 = pd.DataFrame([[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]) + df1 = pd.DataFrame( + [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]] + ) df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) For this, use the :meth:`~DataFrame.combine_first` method: From 33a2e260f6d91cd72914d5ff026d5eb68472d8da Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 3 Oct 2020 11:07:15 -0500 Subject: [PATCH 08/10] Fix --- doc/source/user_guide/computation.rst | 6 ++- doc/source/user_guide/groupby.rst | 9 +++- doc/source/user_guide/missing_data.rst | 5 ++- doc/source/user_guide/timeseries.rst | 57 ++++++++++++++++++++----- doc/source/user_guide/visualization.rst | 6 ++- 5 files changed, 67 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 2f6ac6b06d85e..75fb3380821d8 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -787,7 +787,11 @@ can even be omitted: .. ipython:: python - covs = df[["B", "C", "D"]].rolling(window=50).cov(df[["A", "B", "C"]], pairwise=True) + covs = ( + df[["B", "C", "D"]] + .rolling(window=50) + .cov(df[["A", "B", "C"]], pairwise=True) + ) covs.loc["2002-09-22":] .. ipython:: python diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 9696f14f03b56..bd0a8a70a3ff7 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -267,7 +267,9 @@ the length of the ``groups`` dict, so it is largely just a convenience: height = np.random.normal(60, 10, size=n) time = pd.date_range("1/1/2000", periods=n) gender = np.random.choice(["male", "female"], size=n) - df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time) + df = pd.DataFrame( + {"height": height, "weight": weight, "gender": gender}, index=time + ) .. ipython:: python @@ -767,7 +769,10 @@ For example, suppose we wished to standardize the data within each group: ts.head() ts.tail() - transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std()) + transformed = ts.groupby(lambda x: x.year).transform( + lambda x: (x - x.mean()) / x.std() + ) + We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 3c97cc7da6edb..f83da2e1cc8df 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -400,7 +400,10 @@ You can also interpolate with a DataFrame: .. ipython:: python df = pd.DataFrame( - {"A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4]} + { + "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], + "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], + } ) df df.interpolate() diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 11ec90085d9bf..fc25b98daf7b8 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -317,7 +317,9 @@ which can be specified. These are computed from the starting point specified by .. 
ipython:: python - pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s") + pd.to_datetime( + [1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s" + ) pd.to_datetime( [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], @@ -707,7 +709,9 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python :okwarning: - dft_minute = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index) + dft_minute = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index + ) dft_minute["2011-12-31 23"] @@ -748,10 +752,11 @@ With no defaults. .. ipython:: python dft[ - datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime(2013, 2, 28, 10, 12, 0) + datetime.datetime(2013, 1, 1, 10, 12, 0) : datetime.datetime( + 2013, 2, 28, 10, 12, 0 + ) ] - Truncating & fancy indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1036,8 +1041,15 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ["2012-05-01", datetime.datetime(2013, 5, 1), np.datetime64("2014-05-01")] - bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, + weekmask=weekmask_egypt, + ) dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt @@ -1417,7 +1429,12 @@ An example of how holidays and holiday calendars are defined: rules = [ USMemorialDay, Holiday("July 4th", month=7, day=4, observance=nearest_workday), - Holiday("Columbus Day", month=10, day=1, offset=pd.DateOffset(weekday=MO(2))), + Holiday( + "Columbus Day", + month=10, + day=1, + offset=pd.DateOffset(weekday=MO(2)), + ), ] @@ -2279,7 +2296,12 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string rng_dateutil.tz # dateutil - utc special case - rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=dateutil.tz.tzutc()) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=dateutil.tz.tzutc(), + ) rng_utc.tz .. versionadded:: 0.25.0 @@ -2287,7 +2309,12 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string .. ipython:: python # datetime.timezone - rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=datetime.timezone.utc) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=datetime.timezone.utc, + ) rng_utc.tz Note that the ``UTC`` time zone is a special case in ``dateutil`` and should be constructed explicitly @@ -2440,10 +2467,18 @@ control over how they are handled. .. ipython:: python pd.Timestamp( - datetime.datetime(2019, 10, 27, 1, 30, 0, 0), tz="dateutil/Europe/London", fold=0 + datetime.datetime(2019, 10, 27, 1, 30, 0, 0), + tz="dateutil/Europe/London", + fold=0, ) pd.Timestamp( - year=2019, month=10, day=27, hour=1, minute=30, tz="dateutil/Europe/London", fold=1 + year=2019, + month=10, + day=27, + hour=1, + minute=30, + tz="dateutil/Europe/London", + fold=1, ) .. 
_timeseries.timezone_ambiguous: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 46ab29a52747a..934bc59b29dc5 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1453,7 +1453,11 @@ Here is an example of one way to easily plot group means with standard deviation ) df3 = pd.DataFrame( - {"data1": [3, 2, 4, 3, 2, 4, 3, 2], "data2": [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3 + { + "data1": [3, 2, 4, 3, 2, 4, 3, 2], + "data2": [6, 5, 7, 5, 4, 5, 6, 5], + }, + index=ix3, ) # Group by index labels and take the means and standard deviations From aad946880d71fb93ff1034f6dc2fbf58c9cb9df2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 3 Oct 2020 11:45:24 -0500 Subject: [PATCH 09/10] Fix --- doc/source/user_guide/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index fc25b98daf7b8..4141c579e877a 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -752,7 +752,7 @@ With no defaults. .. ipython:: python dft[ - datetime.datetime(2013, 1, 1, 10, 12, 0) : datetime.datetime( + datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime( 2013, 2, 28, 10, 12, 0 ) ] From def814fe54783028ac76d61d3cc1b027938143ed Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 5 Oct 2020 12:55:19 -0500 Subject: [PATCH 10/10] Fix --- doc/source/user_guide/advanced.rst | 11 ++++++--- doc/source/user_guide/basics.rst | 36 +++++++++++++++++++++++------- doc/source/whatsnew/v0.10.1.rst | 4 +++- doc/source/whatsnew/v0.13.1.rst | 8 +++++-- doc/source/whatsnew/v0.14.1.rst | 4 +++- doc/source/whatsnew/v0.17.0.rst | 4 +++- doc/source/whatsnew/v0.19.0.rst | 4 +++- doc/source/whatsnew/v0.8.0.rst | 4 +++- 8 files changed, 57 insertions(+), 18 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index cec777e0f021e..2cd48ac7adb0e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -304,7 +304,8 @@ whereas a tuple of lists refer to several values within a level: .. ipython:: python s = pd.Series( - [1, 2, 3, 4, 5, 6], index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]) + [1, 2, 3, 4, 5, 6], + index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]), ) s.loc[[("A", "c"), ("B", "d")]] # list of tuples s.loc[(["A", "B"], ["c", "d"])] # tuple of lists @@ -819,7 +820,9 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}) + df3 = pd.DataFrame( + {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")} + ) df3 = df3.set_index("B") df3 @@ -934,7 +937,9 @@ example, be millisecond offsets. 
np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list("AB") ), pd.DataFrame( - np.random.randn(6, 2), index=np.arange(4, 10) * 250.1, columns=list("AB") + np.random.randn(6, 2), + index=np.arange(4, 10) * 250.1, + columns=list("AB"), ), ] ) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 8c01913e55318..53fabf94e24e0 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -464,7 +464,10 @@ which we illustrate: {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} ) df2 = pd.DataFrame( - {"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]} + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } ) df1 df2 @@ -712,7 +715,10 @@ Similarly, you can get the most frequently occurring value(s), i.e. the mode, of s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() df5 = pd.DataFrame( - {"A": np.random.randint(0, 7, size=50), "B": np.random.randint(-10, 15, size=50)} + { + "A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50), + } ) df5.mode() @@ -1192,7 +1198,9 @@ to :ref:`merging/joining functionality `: .. ipython:: python - s = pd.Series(["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]) + s = pd.Series( + ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"] + ) t = pd.Series({"six": 6.0, "seven": 7.0}) s s.map(t) @@ -1494,7 +1502,9 @@ labels). df = pd.DataFrame( {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]}, - index=pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["let", "num"]), + index=pd.MultiIndex.from_product( + [["a", "b", "c"], [1, 2]], names=["let", "num"] + ), ) df df.rename_axis(index={"let": "abc"}) @@ -1803,7 +1813,9 @@ used to sort a pandas object by its index levels. } ) - unsorted_df = df.reindex(index=["a", "d", "c", "b"], columns=["three", "two", "one"]) + unsorted_df = df.reindex( + index=["a", "d", "c", "b"], columns=["three", "two", "one"] + ) unsorted_df # DataFrame @@ -1849,7 +1861,9 @@ to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}) + df1 = pd.DataFrame( + {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]} + ) df1.sort_values(by="two") The ``by`` parameter can take a list of column names, e.g.: @@ -1994,7 +2008,9 @@ all levels to ``by``. .. ipython:: python - df1.columns = pd.MultiIndex.from_tuples([("a", "one"), ("a", "two"), ("b", "three")]) + df1.columns = pd.MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "three")] + ) df1.sort_values(by=("a", "two")) @@ -2245,7 +2261,11 @@ to the correct type. import datetime df = pd.DataFrame( - [[1, 2], ["a", "b"], [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]] + [ + [1, 2], + ["a", "b"], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)], + ] ) df = df.T df diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index d71a0d5ca68cd..611ac2021fcec 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -180,7 +180,9 @@ combined result, by using ``where`` on a selector table. store.select("df2_mt") # as a multiple - store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") + store.select_as_multiple( + ["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt" + ) .. 
ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 1215786b4cccc..249b9555b7fd4 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -101,7 +101,9 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame([pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"]) + df = pd.DataFrame( + [pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"] + ) df["today"] = pd.Timestamp("20130419") df["diff"] = df["today"] - df["age"] df @@ -206,7 +208,9 @@ Enhancements .. code-block:: python # Try to infer the format for the index column - df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True + ) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 78fd182ea86c3..a8f8955c3c1b9 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -124,7 +124,9 @@ Enhancements .. ipython:: python - rng = pd.date_range("3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London") + rng = pd.date_range( + "3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London" + ) rng.tz See :ref:`the docs `. diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 1658f877f5523..d8f39a7d6e3c0 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -786,7 +786,9 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) df_with_missing diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 08ccc1565125f..2ac7b0f54361b 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1091,7 +1091,9 @@ Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex` .. ipython:: python pd.Index([1, 2, 3]).unique() - pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo").unique() + pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo" + ).unique() .. _whatsnew_0190.api.multiindex: diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 8a84630a28b34..b34c2a5c6a07c 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -178,7 +178,9 @@ types. For example, ``'kde'`` is a new option: .. ipython:: python - s = pd.Series(np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))) + s = pd.Series( + np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3)) + ) plt.figure() s.hist(density=True, alpha=0.2) s.plot(kind="kde")
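Taken together, the series applies one wrapping convention to every over-long call: one argument per line, a trailing comma after the last argument, and the closing parenthesis dedented onto its own line, which keeps each line inside the 84-character ``flake8-rst`` limit now set in ``setup.cfg``. A minimal sketch of that pattern on a toy frame (this snippet is illustrative only and is not part of the diff)::

    import pandas as pd

    # A small frame shaped like the reshaping.rst examples above.
    df = pd.DataFrame(
        {
            "row": ["r0", "r0", "r1", "r1"],
            "col": ["c0", "c1", "c0", "c1"],
            "val0": [1.0, 2.0, 3.0, 4.0],
        }
    )

    # Wrapped form: one keyword argument per line with a trailing comma,
    # so no line comes near the 84-character limit.
    df.pivot_table(
        values="val0",
        index="row",
        columns="col",
        aggfunc="mean",
        fill_value=0,
    )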