From 5ca6b98c0e5c552833350fa111db9c1e3d47e3bc Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 09:22:15 +0530
Subject: [PATCH 1/4] DOC: use black to fix development/extending.rst pandas-dev#36777

---
 doc/source/development/extending.rst | 34 +++++++++++++++-------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index c708ebb361ed1..411cbe1000405 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. The class's
         @staticmethod
         def _validate(obj):
             # verify there is a column latitude and a column longitude
-            if 'latitude' not in obj.columns or 'longitude' not in obj.columns:
+            if "latitude" not in obj.columns or "longitude" not in obj.columns:
                 raise AttributeError("Must have 'latitude' and 'longitude'.")

         @property
@@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's

 Now users can access your methods using the ``geo`` namespace:

-    >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
-    ...                    'latitude': np.linspace(0, 20)})
+    >>> ds = pd.DataFrame(
+    ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+    ... )
     >>> ds.geo.center
     (5.0, 10.0)
     >>> ds.geo.plot()
@@ -271,6 +272,7 @@ included as a column in a pandas DataFrame):
         def __arrow_array__(self, type=None):
             # convert the underlying array values to a pyarrow Array
             import pyarrow
+
             return pyarrow.array(..., type=type)

 The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
@@ -377,7 +379,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(to_framed)
     <class '__main__.SubclassedDataFrame'>

-    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
     >>> df
        A  B  C
     0  1  4  7
@@ -387,7 +389,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(df)
     <class '__main__.SubclassedDataFrame'>

-    >>> sliced1 = df[['A', 'B']]
+    >>> sliced1 = df[["A", "B"]]
     >>> sliced1
        A  B
     0  1  4
@@ -397,7 +399,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(sliced1)
     <class '__main__.SubclassedDataFrame'>

-    >>> sliced2 = df['A']
+    >>> sliced2 = df["A"]
     >>> sliced2
     0    1
     1    2
@@ -422,27 +424,27 @@ Below is an example to define two original properties, "internal_cache" as a tem

     class SubclassedDataFrame2(pd.DataFrame):

         # temporary properties
-        _internal_names = pd.DataFrame._internal_names + ['internal_cache']
+        _internal_names = pd.DataFrame._internal_names + ["internal_cache"]
         _internal_names_set = set(_internal_names)

         # normal properties
-        _metadata = ['added_property']
+        _metadata = ["added_property"]

         @property
         def _constructor(self):
-            return SubclassedDataFrame2
+            return SubclassedDataFrame2

 .. code-block:: python

-    >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
     >>> df
        A  B  C
     0  1  4  7
     1  2  5  8
     2  3  6  9
-    >>> df.internal_cache = 'cached'
-    >>> df.added_property = 'property'
+    >>> df.internal_cache = "cached"
+    >>> df.added_property = "property"

     >>> df.internal_cache
     cached
@@ -450,11 +452,11 @@ Below is an example to define two original properties, "internal_cache" as a tem
     property

     # properties defined in _internal_names is reset after manipulation
-    >>> df[['A', 'B']].internal_cache
+    >>> df[["A", "B"]].internal_cache
     AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'

     # properties defined in _metadata are retained
-    >>> df[['A', 'B']].added_property
+    >>> df[["A", "B"]].added_property
     property

 .. _extending.plotting-backends:
@@ -468,7 +470,7 @@ one based on Matplotlib. For example:

 .. code-block:: python

-    >>> pd.set_option('plotting.backend', 'backend.module')
+    >>> pd.set_option("plotting.backend", "backend.module")
     >>> pd.Series([1, 2, 3]).plot()

 This would be more or less equivalent to:
@@ -499,4 +501,4 @@ registers the default "matplotlib" backend as follows.

 More information on how to implement a third-party plotting backend can be
 found at
-https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
\ No newline at end of file
+https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.

From 1562b796fc5273b7343405a51b77e0c04d6b4e5f Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 09:37:40 +0530
Subject: [PATCH 2/4] DOC: fix flake8-rst errors in extending.rst

---
 doc/source/development/extending.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 411cbe1000405..961053b11180a 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -432,7 +432,7 @@ Below is an example to define two original properties, "internal_cache" as a tem

         @property
         def _constructor(self):
-            return SubclassedDataFrame2
+            return SubclassedDataFrame2

 .. code-block:: python

From e2ca2ee6feb87023deeff73bbc67ef73af8b129e Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 10:01:45 +0530
Subject: [PATCH 3/4] DOC: use black to fix code style in user_guide

---
 doc/source/user_guide/duplicates.rst | 35 ++++++++++++----------------
 doc/source/user_guide/gotchas.rst    | 28 +++++++++++++---------
 doc/source/user_guide/scale.rst      | 35 ++++++++++++++--------------
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst
index b65822fab2b23..2993ca7799510 100644
--- a/doc/source/user_guide/duplicates.rst
+++ b/doc/source/user_guide/duplicates.rst
@@ -29,8 +29,8 @@ duplicates present. The output can't be determined, and so pandas raises.
 .. ipython:: python
    :okexcept:

-    s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])
-    s1.reindex(['a', 'b', 'c'])
+    s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])
+    s1.reindex(["a", "b", "c"])

 Other methods, like indexing, can give very surprising results. Typically
 indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame``
 return a scalar. But with duplicates, this isn't the case.

 .. ipython:: python

-    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B'])
+    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"])
     df1

 We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series``

 .. ipython:: python

-    df1['B']  # a series
+    df1["B"]  # a series

 But slicing ``'A'`` returns a ``DataFrame``

 .. ipython:: python

-    df1['A']  # a DataFrame
+    df1["A"]  # a DataFrame

 This applies to row labels as well

 .. ipython:: python

-    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b'])
+    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "a", "b"])
     df2
-    df2.loc['b', 'A']  # a scalar
-    df2.loc['a', 'A']  # a Series
+    df2.loc["b", "A"]  # a scalar
+    df2.loc["a", "A"]  # a Series

 Duplicate Label Detection
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -121,29 +121,24 @@ will be raised.
 .. ipython:: python
    :okexcept:

-    pd.Series(
-        [0, 1, 2],
-        index=['a', 'b', 'b']
-    ).set_flags(allows_duplicate_labels=False)
+    pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False)

 This applies to both row and column labels for a :class:`DataFrame`

 .. ipython:: python
    :okexcept:

-    pd.DataFrame(
-        [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],
-    ).set_flags(allows_duplicate_labels=False)
+    pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags(
+        allows_duplicate_labels=False
+    )

 This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`,
 which indicates whether that object can have duplicate labels.

 .. ipython:: python

-    df = (
-        pd.DataFrame({"A": [0, 1, 2, 3]},
-                     index=['x', 'y', 'X', 'Y'])
-        .set_flags(allows_duplicate_labels=False)
+    df = pd.DataFrame({"A": [0, 1, 2, 3]}, index=["x", "y", "X", "Y"]).set_flags(
+        allows_duplicate_labels=False
     )
     df
     df.flags.allows_duplicate_labels
@@ -198,7 +193,7 @@ operations.
 .. ipython:: python
    :okexcept:

-    s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False)
+    s1 = pd.Series(0, index=["a", "b"]).set_flags(allows_duplicate_labels=False)
     s1
     s1.head().rename({"a": "b"})

diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index a96c70405d859..07c856c96426d 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -21,12 +21,19 @@ when calling :meth:`~DataFrame.info`:

 .. ipython:: python

-    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
-              'complex128', 'object', 'bool']
+    dtypes = [
+        "int64",
+        "float64",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+        "complex128",
+        "object",
+        "bool",
+    ]
     n = 5000
     data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes}
     df = pd.DataFrame(data)
-    df['categorical'] = df['object'].astype('category')
+    df["categorical"] = df["object"].astype("category")

     df.info()

@@ -40,7 +47,7 @@ as it can be expensive to do this deeper introspection.

 .. ipython:: python

-    df.info(memory_usage='deep')
+    df.info(memory_usage="deep")

 By default the display option is set to ``True`` but can be explicitly
 overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.
@@ -155,7 +162,7 @@ index, not membership among the values.

 .. ipython:: python

-    s = pd.Series(range(5), index=list('abcde'))
+    s = pd.Series(range(5), index=list("abcde"))
     2 in s
     'b' in s

@@ -206,11 +213,11 @@ arrays. For example:
 .. ipython:: python

-    s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
+    s = pd.Series([1, 2, 3, 4, 5], index=list("abcde"))
     s
     s.dtype

-    s2 = s.reindex(['a', 'b', 'c', 'f', 'u'])
+    s2 = s.reindex(["a", "b", "c", "f", "u"])
     s2
     s2.dtype

@@ -227,12 +234,11 @@ the nullable-integer extension dtypes provided by pandas

 .. ipython:: python

-    s_int = pd.Series([1, 2, 3, 4, 5], index=list('abcde'),
-                      dtype=pd.Int64Dtype())
+    s_int = pd.Series([1, 2, 3, 4, 5], index=list("abcde"), dtype=pd.Int64Dtype())
     s_int
     s_int.dtype

-    s2_int = s_int.reindex(['a', 'b', 'c', 'f', 'u'])
+    s2_int = s_int.reindex(["a", "b", "c", "f", "u"])
     s2_int
     s2_int.dtype

@@ -334,7 +340,7 @@ constructors using something similar to the following:

 .. ipython:: python

-    x = np.array(list(range(10)), '>i4')  # big endian
+    x = np.array(list(range(10)), ">i4")  # big endian
     newx = x.byteswap().newbyteorder()  # force native byteorder
     s = pd.Series(newx)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 206d8dd0f4739..f36f27269a996 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -72,7 +72,7 @@ Option 1 loads in all the data and then filters to what we need.

 .. ipython:: python

-    columns = ['id_0', 'name_0', 'x_0', 'y_0']
+    columns = ["id_0", "name_0", "x_0", "y_0"]

     pd.read_parquet("timeseries_wide.parquet")[columns]

@@ -123,7 +123,7 @@ space-efficient integers to know which specific name is used in each row.
 .. ipython:: python

     ts2 = ts.copy()
-    ts2['name'] = ts2['name'].astype('category')
+    ts2["name"] = ts2["name"].astype("category")
     ts2.memory_usage(deep=True)

 We can go a bit further and downcast the numeric columns to their smallest types
 using :func:`pandas.to_numeric`.
@@ -131,8 +131,8 @@ using :func:`pandas.to_numeric`.

 .. ipython:: python

-    ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned')
-    ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float')
+    ts2["id"] = pd.to_numeric(ts2["id"], downcast="unsigned")
+    ts2[["x", "y"]] = ts2[["x", "y"]].apply(pd.to_numeric, downcast="float")
     ts2.dtypes

 .. ipython:: python

@@ -141,8 +141,7 @@ using :func:`pandas.to_numeric`.

 .. ipython:: python

-    reduction = (ts2.memory_usage(deep=True).sum()
-                 / ts.memory_usage(deep=True).sum())
+    reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum()
     print(f"{reduction:0.2f}")

 In all, we've reduced the in-memory footprint of this dataset to 1/5 of its

@@ -174,13 +173,13 @@ files. Each file in the directory represents a different year of the entire data

     import pathlib

     N = 12
-    starts = [f'20{i:>02d}-01-01' for i in range(N)]
-    ends = [f'20{i:>02d}-12-13' for i in range(N)]
+    starts = [f"20{i:>02d}-01-01" for i in range(N)]
+    ends = [f"20{i:>02d}-12-13" for i in range(N)]

     pathlib.Path("data/timeseries").mkdir(exist_ok=True)

     for i, (start, end) in enumerate(zip(starts, ends)):
-        ts = _make_timeseries(start=start, end=end, freq='1T', seed=i)
+        ts = _make_timeseries(start=start, end=end, freq="1T", seed=i)
         ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet")

@@ -215,7 +214,7 @@ work for arbitrary-sized datasets.
         # Only one dataframe is in memory at a time...
         df = pd.read_parquet(path)
         # ... plus a small Series ``counts``, which is updated.
-        counts = counts.add(df['name'].value_counts(), fill_value=0)
+        counts = counts.add(df["name"].value_counts(), fill_value=0)
     counts.astype(int)

 Some readers, like :meth:`pandas.read_csv`, offer parameters to control the

@@ -278,8 +277,8 @@ Rather than executing immediately, doing operations build up a **task graph**.
 .. ipython:: python

     ddf
-    ddf['name']
-    ddf['name'].value_counts()
+    ddf["name"]
+    ddf["name"].value_counts()

 Each of these calls is instant because the result isn't being computed yet.
 We're just building up a list of computation to do when someone needs the
@@ -291,7 +290,7 @@ To get the actual result you can call ``.compute()``.

 .. ipython:: python

-    %time ddf['name'].value_counts().compute()
+    %time ddf["name"].value_counts().compute()

 At that point, you get back the same thing you'd get with pandas, in this case
 a concrete pandas Series with the count of each ``name``.
@@ -324,7 +323,7 @@ a familiar groupby aggregation.

 .. ipython:: python

-    %time ddf.groupby('name')[['x', 'y']].mean().compute().head()
+    %time ddf.groupby("name")[["x", "y"]].mean().compute().head()

 The grouping and aggregation is done out-of-core and in parallel.

@@ -336,8 +335,8 @@ we need to supply the divisions manually.
 .. ipython:: python

     N = 12
-    starts = [f'20{i:>02d}-01-01' for i in range(N)]
-    ends = [f'20{i:>02d}-12-13' for i in range(N)]
+    starts = [f"20{i:>02d}-01-01" for i in range(N)]
+    ends = [f"20{i:>02d}-12-13" for i in range(N)]

     divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
     ddf.divisions = divisions
@@ -347,7 +346,7 @@ Now we can do things like fast random access with ``.loc``.

 .. ipython:: python

-    ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute()
+    ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()

 Dask knows to just look in the 3rd partition for selecting values in 2002. It
 doesn't need to look at any other data.
@@ -362,7 +361,7 @@ out of memory. At that point it's just a regular pandas object.
    :okwarning:

     @savefig dask_resample.png
-    ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot()
+    ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot()

 These Dask examples have all be done using multiple processes on a single
 machine. Dask can be `deployed on a cluster

From f438945f6ceb059f7a267f321ab12d4fcb38b41a Mon Sep 17 00:00:00 2001
From: PrayagS
Date: Sat, 3 Oct 2020 11:52:35 +0530
Subject: [PATCH 4/4] DOC: fix under-indented line in development/extending.rst

---
 doc/source/development/extending.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 961053b11180a..6fabffc314fe5 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -51,7 +51,7 @@ decorate a class, providing the name of attribute to add. The class's
 Now users can access your methods using the ``geo`` namespace:

     >>> ds = pd.DataFrame(
-    ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+    ...     {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
     ... )
     >>> ds.geo.center
     (5.0, 10.0)
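
The snippet below is a minimal sketch (not part of the patch series) of the kind of rewrite
these patches apply to the documentation code samples. It assumes ``black`` is installed and
uses its Python API (``black.format_str`` with a default ``black.Mode``); the exact way the
pandas checks invoke black may differ.

.. code-block:: python

    # Sketch only: show the quote normalization black applies to a snippet
    # like the single-quoted lines removed in the hunks above.
    import black

    src = "s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])\n"
    print(black.format_str(src, mode=black.Mode()), end="")
    # s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])

Running it on any of the removed single-quoted lines reproduces the corresponding
double-quoted replacement line added by these patches.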