From 5ca6b98c0e5c552833350fa111db9c1e3d47e3bc Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 09:22:15 +0530
Subject: [PATCH 1/4] DOC: use black to fix development/extending.rst pandas-dev#36777

---
 doc/source/development/extending.rst | 34 +++++++++++++++-------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index c708ebb361ed1..411cbe1000405 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. The class's
         @staticmethod
         def _validate(obj):
             # verify there is a column latitude and a column longitude
-            if 'latitude' not in obj.columns or 'longitude' not in obj.columns:
+            if "latitude" not in obj.columns or "longitude" not in obj.columns:
                 raise AttributeError("Must have 'latitude' and 'longitude'.")

         @property
@@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's

 Now users can access your methods using the ``geo`` namespace:

-    >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
-    ...                    'latitude': np.linspace(0, 20)})
+    >>> ds = pd.DataFrame(
+    ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+    ... )
     >>> ds.geo.center
     (5.0, 10.0)
     >>> ds.geo.plot()
@@ -271,6 +272,7 @@ included as a column in a pandas DataFrame):
         def __arrow_array__(self, type=None):
             # convert the underlying array values to a pyarrow Array
             import pyarrow
+
             return pyarrow.array(..., type=type)

 The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
@@ -377,7 +379,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(to_framed)
     <class '__main__.SubclassedDataFrame'>

-    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
     >>> df
        A  B  C
     0  1  4  7
@@ -387,7 +389,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(df)
     <class '__main__.SubclassedDataFrame'>

-    >>> sliced1 = df[['A', 'B']]
+    >>> sliced1 = df[["A", "B"]]
     >>> sliced1
        A  B
     0  1  4
@@ -397,7 +399,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(sliced1)
     <class '__main__.SubclassedDataFrame'>

-    >>> sliced2 = df['A']
+    >>> sliced2 = df["A"]
     >>> sliced2
     0    1
     1    2
@@ -422,27 +424,27 @@ Below is an example to define two original properties, "internal_cache" as a tem

     class SubclassedDataFrame2(pd.DataFrame):

         # temporary properties
-        _internal_names = pd.DataFrame._internal_names + ['internal_cache']
+        _internal_names = pd.DataFrame._internal_names + ["internal_cache"]
         _internal_names_set = set(_internal_names)

         # normal properties
-        _metadata = ['added_property']
+        _metadata = ["added_property"]

         @property
         def _constructor(self):
-            return SubclassedDataFrame2
+            return SubclassedDataFrame2

 .. code-block:: python

-    >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
     >>> df
        A  B  C
     0  1  4  7
     1  2  5  8
     2  3  6  9
-    >>> df.internal_cache = 'cached'
-    >>> df.added_property = 'property'
+    >>> df.internal_cache = "cached"
+    >>> df.added_property = "property"

     >>> df.internal_cache
     cached
@@ -450,11 +452,11 @@ Below is an example to define two original properties, "internal_cache" as a tem
     property

     # properties defined in _internal_names is reset after manipulation
-    >>> df[['A', 'B']].internal_cache
+    >>> df[["A", "B"]].internal_cache
     AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'

     # properties defined in _metadata are retained
-    >>> df[['A', 'B']].added_property
+    >>> df[["A", "B"]].added_property
     property

 .. _extending.plotting-backends:
@@ -468,7 +470,7 @@ one based on Matplotlib. For example:

 .. code-block:: python

-    >>> pd.set_option('plotting.backend', 'backend.module')
+    >>> pd.set_option("plotting.backend", "backend.module")
     >>> pd.Series([1, 2, 3]).plot()

 This would be more or less equivalent to:
@@ -499,4 +501,4 @@ registers the default "matplotlib" backend as follows.

 More information on how to implement a third-party plotting backend can be
 found at
-https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
\ No newline at end of file
+https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.

From 1562b796fc5273b7343405a51b77e0c04d6b4e5f Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 09:37:40 +0530
Subject: [PATCH 2/4] DOC: fix flake8-rst errors in extending.rst

---
 doc/source/development/extending.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 411cbe1000405..961053b11180a 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -432,7 +432,7 @@ Below is an example to define two original properties, "internal_cache" as a tem

         @property
         def _constructor(self):
-            return SubclassedDataFrame2
+            return SubclassedDataFrame2

 .. code-block:: python

From e2ca2ee6feb87023deeff73bbc67ef73af8b129e Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 10:01:45 +0530
Subject: [PATCH 3/4] DOC: use black to fix code style in user_guide

---
 doc/source/user_guide/duplicates.rst | 35 ++++++++++++----------------
 doc/source/user_guide/gotchas.rst    | 28 +++++++++++++---------
 doc/source/user_guide/scale.rst      | 35 ++++++++++++++--------------
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst
index b65822fab2b23..2993ca7799510 100644
--- a/doc/source/user_guide/duplicates.rst
+++ b/doc/source/user_guide/duplicates.rst
@@ -29,8 +29,8 @@ duplicates present. The output can't be determined, and so pandas raises.
 .. ipython:: python
    :okexcept:

-    s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])
-    s1.reindex(['a', 'b', 'c'])
+    s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])
+    s1.reindex(["a", "b", "c"])

 Other methods, like indexing, can give very surprising results. Typically
 indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame``
 return a scalar. But with duplicates, this isn't the case.

 .. ipython:: python

-    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B'])
+    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"])
     df1

 We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series``

 .. ipython:: python

-    df1['B']  # a series
+    df1["B"]  # a series

 But slicing ``'A'`` returns a ``DataFrame``

 .. ipython:: python

-    df1['A']  # a DataFrame
+    df1["A"]  # a DataFrame

 This applies to row labels as well

 .. ipython:: python

-    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b'])
+    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "a", "b"])
     df2
-    df2.loc['b', 'A']  # a scalar
-    df2.loc['a', 'A']  # a Series
+    df2.loc["b", "A"]  # a scalar
+    df2.loc["a", "A"]  # a Series

 Duplicate Label Detection
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -121,29 +121,24 @@ will be raised.
 .. ipython:: python
    :okexcept:

-    pd.Series(
-        [0, 1, 2],
-        index=['a', 'b', 'b']
-    ).set_flags(allows_duplicate_labels=False)
+    pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False)

 This applies to both row and column labels for a :class:`DataFrame`

 .. ipython:: python
    :okexcept:

-    pd.DataFrame(
-        [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],
-    ).set_flags(allows_duplicate_labels=False)
+    pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags(
+        allows_duplicate_labels=False
+    )

 This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`,
 which indicates whether that object can have duplicate labels.

 .. ipython:: python

-    df = (
-        pd.DataFrame({"A": [0, 1, 2, 3]},
-                     index=['x', 'y', 'X', 'Y'])
-        .set_flags(allows_duplicate_labels=False)
+    df = pd.DataFrame({"A": [0, 1, 2, 3]}, index=["x", "y", "X", "Y"]).set_flags(
+        allows_duplicate_labels=False
     )
     df
     df.flags.allows_duplicate_labels
@@ -198,7 +193,7 @@ operations.
 .. ipython:: python
    :okexcept:

-    s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False)
+    s1 = pd.Series(0, index=["a", "b"]).set_flags(allows_duplicate_labels=False)
     s1
     s1.head().rename({"a": "b"})

diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index a96c70405d859..07c856c96426d 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -21,12 +21,19 @@ when calling :meth:`~DataFrame.info`:

 .. ipython:: python

-    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
-              'complex128', 'object', 'bool']
+    dtypes = [
+        "int64",
+        "float64",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+        "complex128",
+        "object",
+        "bool",
+    ]
     n = 5000
     data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes}
     df = pd.DataFrame(data)
-    df['categorical'] = df['object'].astype('category')
+    df["categorical"] = df["object"].astype("category")

     df.info()

@@ -40,7 +47,7 @@ as it can be expensive to do this deeper introspection.

 .. ipython:: python

-    df.info(memory_usage='deep')
+    df.info(memory_usage="deep")

 By default the display option is set to ``True`` but can be explicitly
 overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.
@@ -155,7 +162,7 @@ index, not membership among the values.

 .. ipython:: python

-    s = pd.Series(range(5), index=list('abcde'))
+    s = pd.Series(range(5), index=list("abcde"))
     2 in s
     'b' in s

@@ -206,11 +213,11 @@ arrays. For example:
 .. ipython:: python

-    s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
+    s = pd.Series([1, 2, 3, 4, 5], index=list("abcde"))
     s
     s.dtype

-    s2 = s.reindex(['a', 'b', 'c', 'f', 'u'])
+    s2 = s.reindex(["a", "b", "c", "f", "u"])
     s2
     s2.dtype

@@ -227,12 +234,11 @@ the nullable-integer extension dtypes provided by pandas

 .. ipython:: python

-    s_int = pd.Series([1, 2, 3, 4, 5], index=list('abcde'),
-                      dtype=pd.Int64Dtype())
+    s_int = pd.Series([1, 2, 3, 4, 5], index=list("abcde"), dtype=pd.Int64Dtype())
     s_int
     s_int.dtype

-    s2_int = s_int.reindex(['a', 'b', 'c', 'f', 'u'])
+    s2_int = s_int.reindex(["a", "b", "c", "f", "u"])
     s2_int
     s2_int.dtype

@@ -334,7 +340,7 @@ constructors using something similar to the following:

 .. ipython:: python

-    x = np.array(list(range(10)), '>i4')  # big endian
+    x = np.array(list(range(10)), ">i4")  # big endian
     newx = x.byteswap().newbyteorder()  # force native byteorder
     s = pd.Series(newx)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 206d8dd0f4739..f36f27269a996 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -72,7 +72,7 @@ Option 1 loads in all the data and then filters to what we need.

 .. ipython:: python

-    columns = ['id_0', 'name_0', 'x_0', 'y_0']
+    columns = ["id_0", "name_0", "x_0", "y_0"]

     pd.read_parquet("timeseries_wide.parquet")[columns]

@@ -123,7 +123,7 @@ space-efficient integers to know which specific name is used in each row.
 .. ipython:: python

     ts2 = ts.copy()
-    ts2['name'] = ts2['name'].astype('category')
+    ts2["name"] = ts2["name"].astype("category")
     ts2.memory_usage(deep=True)

 We can go a bit further and downcast the numeric columns to their smallest types
 using :func:`pandas.to_numeric`.
@@ -131,8 +131,8 @@ using :func:`pandas.to_numeric`.

 .. ipython:: python

-    ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned')
-    ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float')
+    ts2["id"] = pd.to_numeric(ts2["id"], downcast="unsigned")
+    ts2[["x", "y"]] = ts2[["x", "y"]].apply(pd.to_numeric, downcast="float")
     ts2.dtypes

 .. ipython:: python

@@ -141,8 +141,7 @@ using :func:`pandas.to_numeric`.

 .. ipython:: python

-    reduction = (ts2.memory_usage(deep=True).sum()
-                 / ts.memory_usage(deep=True).sum())
+    reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum()
     print(f"{reduction:0.2f}")

 In all, we've reduced the in-memory footprint of this dataset to 1/5 of its

@@ -174,13 +173,13 @@ files. Each file in the directory represents a different year of the entire data

     import pathlib

     N = 12
-    starts = [f'20{i:>02d}-01-01' for i in range(N)]
-    ends = [f'20{i:>02d}-12-13' for i in range(N)]
+    starts = [f"20{i:>02d}-01-01" for i in range(N)]
+    ends = [f"20{i:>02d}-12-13" for i in range(N)]

     pathlib.Path("data/timeseries").mkdir(exist_ok=True)

     for i, (start, end) in enumerate(zip(starts, ends)):
-        ts = _make_timeseries(start=start, end=end, freq='1T', seed=i)
+        ts = _make_timeseries(start=start, end=end, freq="1T", seed=i)
         ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet")

@@ -215,7 +214,7 @@ work for arbitrary-sized datasets.
         # Only one dataframe is in memory at a time...
         df = pd.read_parquet(path)
         # ... plus a small Series ``counts``, which is updated.
-        counts = counts.add(df['name'].value_counts(), fill_value=0)
+        counts = counts.add(df["name"].value_counts(), fill_value=0)
     counts.astype(int)

 Some readers, like :meth:`pandas.read_csv`, offer parameters to control the

@@ -278,8 +277,8 @@ Rather than executing immediately, doing operations build up a **task graph**.
 .. ipython:: python

     ddf
-    ddf['name']
-    ddf['name'].value_counts()
+    ddf["name"]
+    ddf["name"].value_counts()

 Each of these calls is instant because the result isn't being computed yet.
 We're just building up a list of computation to do when someone needs the
@@ -291,7 +290,7 @@ To get the actual result you can call ``.compute()``.

 .. ipython:: python

-    %time ddf['name'].value_counts().compute()
+    %time ddf["name"].value_counts().compute()

 At that point, you get back the same thing you'd get with pandas, in this case
 a concrete pandas Series with the count of each ``name``.
@@ -324,7 +323,7 @@ a familiar groupby aggregation.

 .. ipython:: python

-    %time ddf.groupby('name')[['x', 'y']].mean().compute().head()
+    %time ddf.groupby("name")[["x", "y"]].mean().compute().head()

 The grouping and aggregation is done out-of-core and in parallel.

@@ -336,8 +335,8 @@ we need to supply the divisions manually.
 .. ipython:: python

     N = 12
-    starts = [f'20{i:>02d}-01-01' for i in range(N)]
-    ends = [f'20{i:>02d}-12-13' for i in range(N)]
+    starts = [f"20{i:>02d}-01-01" for i in range(N)]
+    ends = [f"20{i:>02d}-12-13" for i in range(N)]

     divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
     ddf.divisions = divisions
@@ -347,7 +346,7 @@ Now we can do things like fast random access with ``.loc``.

 .. ipython:: python

-    ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute()
+    ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()

 Dask knows to just look in the 3rd partition for selecting values in 2002. It
 doesn't need to look at any other data.
@@ -362,7 +361,7 @@ out of memory. At that point it's just a regular pandas object.
    :okwarning:

     @savefig dask_resample.png
-    ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot()
+    ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot()

 These Dask examples have all be done using multiple processes on a single
 machine. Dask can be `deployed on a cluster

From f438945f6ceb059f7a267f321ab12d4fcb38b41a Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 11:52:35 +0530
Subject: [PATCH 4/4] DOC: fix under-indented line in development/extending.rst

---
 doc/source/development/extending.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 961053b11180a..6fabffc314fe5 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -51,7 +51,7 @@ decorate a class, providing the name of attribute to add. The class's
 Now users can access your methods using the ``geo`` namespace:

     >>> ds = pd.DataFrame(
-    ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+    ...     {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
     ... )
     >>> ds.geo.center
     (5.0, 10.0)
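
The snippet below is a minimal sketch (not part of the patch series) of the kind of rewrite
these patches apply to the documentation code samples. It assumes ``black`` is installed and
uses its Python API (``black.format_str`` with a default ``black.Mode``); the exact way the
pandas checks invoke black may differ.

.. code-block:: python

    # Sketch only: show the quote normalization black applies to a snippet
    # like the single-quoted lines removed in the hunks above.
    import black

    src = "s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])\n"
    print(black.format_str(src, mode=black.Mode()), end="")
    # s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])

Running it on any of the removed single-quoted lines reproduces the corresponding
double-quoted replacement line added by these patches.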