diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 201820c6a8b28..b7af033ede042 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -122,6 +122,14 @@ repos:
         files: ^pandas/_libs/src|^pandas/_libs/include
         args: [-i]
         types_or: [c, c++]
+- repo: https://github.com/adamchainz/blacken-docs
+  rev: 1.16.0
+  hooks:
+  - id: blacken-docs
+    additional_dependencies:
+    - black==23.12.1
+    types_or: [rst]
+    args: ["--skip-errors", "--skip-string-normalization"]
 - repo: local
   hooks:
   - id: pyright
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 77dd5d03d311c..00c158022913d 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -133,6 +133,9 @@
 numpydoc_show_inherited_class_members = False
 numpydoc_attributes_as_param_list = False

+# IPython
+ipython_warning_is_error = False
+
 # matplotlib plot directive
 plot_include_source = True
 plot_formats = [("png", 90)]
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 39e279fd5c917..9754eeb8ea6ef 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -179,6 +179,7 @@ The appropriate way to annotate this would be as follows

     str_type = str

+
     class SomeClass2:
         str: str_type = None

@@ -190,8 +191,8 @@ In some cases you may be tempted to use ``cast`` from the typing module when you

     from pandas.core.dtypes.common import is_number

-    def cannot_infer_bad(obj: Union[str, int, float]):
+    def cannot_infer_bad(obj: Union[str, int, float]):
         if is_number(obj):
             ...
         else:  # Reasonably only str objects would reach this but...
@@ -203,7 +204,6 @@ The limitation here is that while a human can reasonably understand that ``is_nu
 .. code-block:: python

     def cannot_infer_good(obj: Union[str, int, float]):
-
         if isinstance(obj, str):
             return obj.upper()
         else:
@@ -222,6 +222,7 @@ For example, quite a few functions in pandas accept a ``dtype`` argument. This c

     from pandas._typing import Dtype

+
     def as_type(dtype: Dtype) -> ...:
         ...

@@ -428,6 +429,7 @@ be located.
     import pandas as pd
     import pandas._testing as tm

+
     def test_getitem_listlike_of_ints():
         ser = pd.Series(range(5))

@@ -641,9 +643,13 @@ as a comment to a new test.

     @pytest.mark.parametrize(
-        'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip),
-                  pytest.param('int32', marks=pytest.mark.xfail(
-                      reason='to show how it works'))])
+        'dtype',
+        [
+            'float32',
+            pytest.param('int16', marks=pytest.mark.skip),
+            pytest.param('int32', marks=pytest.mark.xfail(reason='to show how it works')),
+        ],
+    )
     def test_mark(dtype):
         assert str(np.dtype(dtype)) == 'float32'

@@ -722,10 +728,16 @@ for details `_.

     import json
     from hypothesis import given, strategies as st

-    any_json_value = st.deferred(lambda: st.one_of(
-        st.none(), st.booleans(), st.floats(allow_nan=False), st.text(),
-        st.lists(any_json_value), st.dictionaries(st.text(), any_json_value)
-    ))
+    any_json_value = st.deferred(
+        lambda: st.one_of(
+            st.none(),
+            st.booleans(),
+            st.floats(allow_nan=False),
+            st.text(),
+            st.lists(any_json_value),
+            st.dictionaries(st.text(), any_json_value),
+        )
+    )


     @given(value=any_json_value)
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index e2881c1087e60..ec18439fa53d8 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -137,7 +137,6 @@ backticks. The following are considered inline code:
 .. code-block:: python

     def func():
-
         """Some function.

         With several mistakes in the docstring.
@@ -463,6 +462,7 @@ With more than one value:

     import string

+
     def random_letters():
         """
         Generate and return a sequence of random letters.
@@ -478,8 +478,7 @@ With more than one value:
             String of random letters.
         """
         length = np.random.randint(1, 10)
-        letters = ''.join(np.random.choice(string.ascii_lowercase)
-                          for i in range(length))
+        letters = ''.join(np.random.choice(string.ascii_lowercase) for i in range(length))
         return length, letters

 If the method yields its value:
@@ -628,7 +627,6 @@ A simple example could be:
 .. code-block:: python

     class Series:
-
         def head(self, n=5):
             """
             Return the first elements of the Series.
@@ -724,7 +722,6 @@ positional arguments ``head(3)``.
 .. code-block:: python

     class Series:
-
         def mean(self):
             """
             Compute the mean of the input.
@@ -737,7 +734,6 @@ positional arguments ``head(3)``.
             """
             pass

-
         def fillna(self, value):
             """
             Replace missing values by ``value``.
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index e67829b8805eb..c43a84b24e783 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -408,7 +408,6 @@ Below is an example to define two original properties, "internal_cache" as a tem
 .. code-block:: python

     class SubclassedDataFrame2(pd.DataFrame):
-
         # temporary properties
         _internal_names = pd.DataFrame._internal_names + ["internal_cache"]
         _internal_names_set = set(_internal_names)
@@ -526,6 +525,7 @@ The ``__pandas_priority__`` of :class:`DataFrame`, :class:`Series`, and :class:`
             # return `self` and not the addition for simplicity
             return self

+
     custom = CustomList()
     series = pd.Series([1, 2, 3])
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index 5d833dca50732..7fa2609ae7ca8 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -144,6 +144,7 @@ create a file ``t.py`` in your pandas directory, which contains
 .. code-block:: python

     import pandas as pd
+
     assert pd.Series([1, 1]).sum() == 2

 and then run::
diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
index d55b669d94a87..96ac7aa948282 100644
--- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
+++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
@@ -427,9 +427,7 @@ The equivalent in pandas:

 .. ipython:: python

-    pd.pivot_table(
-        tips, values="tip", index=["size"], columns=["sex"], aggfunc=np.average
-    )
+    pd.pivot_table(tips, values="tip", index=["size"], columns=["sex"], aggfunc=np.average)


 Adding a row
@@ -440,8 +438,9 @@ Assuming we are using a :class:`~pandas.RangeIndex` (numbered ``0``, ``1``, etc.
 .. ipython:: python

     df
-    new_row = pd.DataFrame([["E", 51, True]],
-                           columns=["class", "student_count", "all_pass"])
+    new_row = pd.DataFrame(
+        [["E", 51, True]], columns=["class", "student_count", "all_pass"]
+    )
     pd.concat([df, new_row])
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index daa528c7d408a..b1c6474cdbc88 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -331,9 +331,7 @@ UNION
     df1 = pd.DataFrame(
         {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
     )
-    df2 = pd.DataFrame(
-        {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
-    )
+    df2 = pd.DataFrame({"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]})

 .. code-block:: sql

@@ -433,9 +431,7 @@ Top n rows per group
     (
         tips.assign(
-            rn=tips.sort_values(["total_bill"], ascending=False)
-            .groupby(["day"])
-            .cumcount()
+            rn=tips.sort_values(["total_bill"], ascending=False).groupby(["day"]).cumcount()
             + 1
         )
         .query("rn < 3")
@@ -448,9 +444,7 @@ the same using ``rank(method='first')`` function
     (
         tips.assign(
-            rnk=tips.groupby(["day"])["total_bill"].rank(
-                method="first", ascending=False
-            )
+            rnk=tips.groupby(["day"])["total_bill"].rank(method="first", ascending=False)
         )
         .query("rnk < 3")
         .sort_values(["day", "rnk"])
diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst
index fb9ee2e216cd7..905c62b7bc28a 100644
--- a/doc/source/getting_started/comparison/includes/time_date.rst
+++ b/doc/source/getting_started/comparison/includes/time_date.rst
@@ -5,13 +5,11 @@
     tips["date1_year"] = tips["date1"].dt.year
     tips["date2_month"] = tips["date2"].dt.month
     tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
-    tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
-        "date1"
-    ].dt.to_period("M")
+    tips["months_between"] = tips["date2"].dt.to_period("M") - tips["date1"].dt.to_period(
+        "M"
+    )

-    tips[
-        ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
-    ]
+    tips[["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]]

 .. ipython:: python
    :suppress:
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 77e273d8c81fe..c8a5b705507fb 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -108,6 +108,7 @@ obtain these directories with.
 .. code-block:: python

     import sys
+
     sys.path

 One way you could be encountering this error is if you have multiple Python installations on your system
diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst
index 9081f274cd941..533901d425b1b 100644
--- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst
+++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst
@@ -40,10 +40,8 @@ Westminster* in respectively Paris, Antwerp and London.

 .. ipython:: python

-    air_quality_no2 = pd.read_csv("data/air_quality_no2_long.csv",
-                                  parse_dates=True)
-    air_quality_no2 = air_quality_no2[["date.utc", "location",
-                                       "parameter", "value"]]
+    air_quality_no2 = pd.read_csv("data/air_quality_no2_long.csv", parse_dates=True)
+    air_quality_no2 = air_quality_no2[["date.utc", "location", "parameter", "value"]]
     air_quality_no2.head()

 .. raw:: html
@@ -75,10 +73,8 @@ Westminster* in respectively Paris, Antwerp and London.

 .. ipython:: python

-    air_quality_pm25 = pd.read_csv("data/air_quality_pm25_long.csv",
-                                   parse_dates=True)
-    air_quality_pm25 = air_quality_pm25[["date.utc", "location",
-                                         "parameter", "value"]]
+    air_quality_pm25 = pd.read_csv("data/air_quality_pm25_long.csv", parse_dates=True)
+    air_quality_pm25 = air_quality_pm25[["date.utc", "location", "parameter", "value"]]
     air_quality_pm25.head()

 .. raw:: html
@@ -265,8 +261,9 @@ Add the parameters' full description and name, provided by the parameters metada

 .. ipython:: python

-    air_quality = pd.merge(air_quality, air_quality_parameters,
-                           how='left', left_on='parameter', right_on='id')
+    air_quality = pd.merge(
+        air_quality, air_quality_parameters, how='left', left_on='parameter', right_on='id'
+    )
     air_quality.head()

 Compared to the previous example, there is no common column name.
diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst
index b0530087e5b84..01112943bda6a 100644
--- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst
+++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst
@@ -174,8 +174,7 @@ What is the average :math:`NO_2` concentration for each day of the week for each

 .. ipython:: python

-    air_quality.groupby(
-        [air_quality["datetime"].dt.weekday, "location"])["value"].mean()
+    air_quality.groupby([air_quality["datetime"].dt.weekday, "location"])["value"].mean()

 Remember the split-apply-combine pattern provided by ``groupby`` from the
 :ref:`tutorial on statistics calculation <10min_tut_06_stats>`?
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 3cdcb81c14961..070c05cfc63d3 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -550,8 +550,8 @@ Stack
 .. ipython:: python

     arrays = [
-       ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
-       ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+        ["one", "two", "one", "two", "one", "two", "one", "two"],
     ]
     index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
     df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index f7ab466e92d93..171beb86e7452 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -792,9 +792,7 @@ values **not** in the categories, similarly to how you can reindex **any** panda

 .. ipython:: python

-    df3 = pd.DataFrame(
-        {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}
-    )
+    df3 = pd.DataFrame({"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")})
     df3 = df3.set_index("B")
     df3
@@ -1096,7 +1094,7 @@ index can be somewhat complicated. For example, the following does not work:
 .. ipython:: python
    :okexcept:

-    s.loc['c':'e' + 1]
+    s.loc['c' : 'e' + 1]

 A very common use case is to limit a time series to start and end
 at two specific dates. To enable this, we made the design choice to make label-based
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
index eed3fc149263a..ae20c281f227b 100644
--- a/doc/source/user_guide/basics.rst
+++ b/doc/source/user_guide/basics.rst
@@ -898,6 +898,7 @@ method.
     def subtract_and_divide(x, sub, divide=1):
         return (x - sub) / divide

+
     df_udf = pd.DataFrame(np.ones((2, 2)))
     df_udf.apply(subtract_and_divide, args=(5,), divide=3)
@@ -1140,9 +1141,11 @@ a single value and returning a single value. For example:
     df4 = df.copy()
     df4

+
     def f(x):
         return len(str(x))

+
     df4["one"].map(f)
     df4.map(f)
@@ -1152,9 +1155,7 @@ to :ref:`merging/joining functionality `:

 .. ipython:: python

-    s = pd.Series(
-        ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]
-    )
+    s = pd.Series(["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"])
     t = pd.Series({"six": 6.0, "seven": 7.0})
     s
     s.map(t)
@@ -1438,9 +1439,7 @@ labels).

     df = pd.DataFrame(
         {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]},
-        index=pd.MultiIndex.from_product(
-            [["a", "b", "c"], [1, 2]], names=["let", "num"]
-        ),
+        index=pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["let", "num"]),
     )
     df
     df.rename_axis(index={"let": "abc"})
@@ -1749,9 +1748,7 @@ used to sort a pandas object by its index levels.
         }
     )

-    unsorted_df = df.reindex(
-        index=["a", "d", "c", "b"], columns=["three", "two", "one"]
-    )
+    unsorted_df = df.reindex(index=["a", "d", "c", "b"], columns=["three", "two", "one"])
     unsorted_df

     # DataFrame
@@ -1795,9 +1792,7 @@ to use to determine the sorted order.

 .. ipython:: python

-    df1 = pd.DataFrame(
-        {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}
-    )
+    df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]})
     df1.sort_values(by="two")

 The ``by`` parameter can take a list of column names, e.g.:
@@ -1940,9 +1935,7 @@ all levels to ``by``.

 .. ipython:: python

-    df1.columns = pd.MultiIndex.from_tuples(
-        [("a", "one"), ("a", "two"), ("b", "three")]
-    )
+    df1.columns = pd.MultiIndex.from_tuples([("a", "one"), ("a", "two"), ("b", "three")])
     df1.sort_values(by=("a", "two"))

@@ -2082,7 +2075,9 @@ different numeric dtypes will **NOT** be combined. The following example will gi
         {
             "A": pd.Series(np.random.randn(8), dtype="float16"),
             "B": pd.Series(np.random.randn(8)),
-            "C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"),  # [0,255] (range of uint8)
+            "C": pd.Series(
+                np.random.randint(0, 255, size=8), dtype="uint8"
+            ),  # [0,255] (range of uint8)
         }
     )
     df2
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 3dfc6534f2b64..f5d3ac0f6277a 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -262,9 +262,7 @@ New columns

 .. ipython:: python

-    df = pd.DataFrame(
-        {"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]}
-    )
+    df = pd.DataFrame({"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]})
     df

 Method 1 : idxmin() to get the index of the minimums
@@ -325,9 +323,7 @@ Arithmetic

 .. ipython:: python

-    cols = pd.MultiIndex.from_tuples(
-        [(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]]
-    )
+    cols = pd.MultiIndex.from_tuples([(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]])
     df = pd.DataFrame(np.random.randn(2, 6), index=["n", "m"], columns=cols)
     df
     df = df.div(df["C"], level=1)
@@ -459,7 +455,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     df

     # List the size of the animals with the highest weight.
-    df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False)
+    df.groupby("animal").apply(
+        lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False
+    )

 `Using get_group `__
@@ -492,12 +490,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to

     S = pd.Series([i / 100.0 for i in range(1, 11)])

+
     def cum_ret(x, y):
         return x * (1 + y)

+
     def red(x):
         return functools.reduce(cum_ret, x, 1.0)

+
     S.expanding().apply(red, raw=True)

@@ -509,10 +510,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
     gb = df.groupby("A")

+
     def replace(g):
         mask = g < 0
         return g.where(~mask, g[~mask].mean())

+
     gb.transform(replace)

 `Sort groups by aggregated data
@@ -544,11 +547,13 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     rng = pd.date_range(start="2014-10-07", periods=10, freq="2min")
     ts = pd.Series(data=list(range(10)), index=rng)

+
     def MyCust(x):
         if len(x) > 2:
             return x.iloc[1] * 1.234
         return pd.NaT

+
     mhc = {"Mean": "mean", "Max": "max", "Custom": MyCust}
     ts.resample("5min").apply(mhc)
     ts
@@ -558,9 +563,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to

 .. ipython:: python

-    df = pd.DataFrame(
-        {"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]}
-    )
+    df = pd.DataFrame({"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]})
     df
     df["Counts"] = df.groupby(["Color"]).transform(len)
     df
@@ -642,10 +645,7 @@ Create a list of dataframes, split using a delineation based on logic included i
     dfs = list(
         zip(
             *df.groupby(
-                (1 * (df["Case"] == "B"))
-                .cumsum()
-                .rolling(window=3, min_periods=1)
-                .median()
+                (1 * (df["Case"] == "B")).cumsum().rolling(window=3, min_periods=1).median()
             )
         )
     )[-1]
@@ -794,12 +794,12 @@ Apply
         index=["I", "II", "III"],
     )

+
     def SeriesFromSubList(aList):
         return pd.Series(aList)

-    df_orgz = pd.concat(
-        {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
-    )
+
+    df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()})
     df_orgz

 `Rolling apply with a DataFrame returning a Series
@@ -816,13 +816,15 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
     )
     df

+
     def gm(df, const):
         v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const
         return v.iloc[-1]

+
     s = pd.Series(
         {
-            df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5)
+            df.index[i]: gm(df.iloc[i : min(i + 51, len(df) - 1)], 5)
             for i in range(len(df) - 50)
         }
     )
@@ -846,13 +848,15 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight
     )
     df

+
     def vwap(bars):
         return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

+
     window = 5
     s = pd.concat(
         [
-            (pd.Series(vwap(df.iloc[i: i + window]), index=[df.index[i + window]]))
+            (pd.Series(vwap(df.iloc[i : i + window]), index=[df.index[i + window]]))
             for i in range(len(df) - window)
         ]
     )
@@ -1406,8 +1410,8 @@ The ``method`` argument within ``DataFrame.corr`` can accept a callable in addit
         A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean())
         B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean())
         cov_ab = np.sqrt(np.nansum(A * B)) / n
-        std_a = np.sqrt(np.sqrt(np.nansum(A ** 2)) / n)
-        std_b = np.sqrt(np.sqrt(np.nansum(B ** 2)) / n)
+        std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n)
+        std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n)
         return cov_ab / std_a / std_b
diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
index 9757a72f13fa8..64f388b8c0a4b 100644
--- a/doc/source/user_guide/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -658,7 +658,7 @@ Arithmetic operations with scalars operate element-wise:

     df * 5 + 2
     1 / df
-    df ** 4
+    df**4

 .. _dsintro.boolean:
diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst
index 7894789846ce8..4ff023c3ebcae 100644
--- a/doc/source/user_guide/duplicates.rst
+++ b/doc/source/user_guide/duplicates.rst
@@ -129,9 +129,10 @@ This applies to both row and column labels for a :class:`DataFrame`
 .. ipython:: python
    :okexcept:

-    pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags(
-        allows_duplicate_labels=False
-    )
+    pd.DataFrame(
+        [[0, 1, 2], [3, 4, 5]],
+        columns=["A", "B", "C"],
+    ).set_flags(allows_duplicate_labels=False)

 This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`,
 which indicates whether that object can have duplicate labels.
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index 99c85ac66623d..a8a72e038e9a2 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -199,6 +199,7 @@ Here is a similar example with :meth:`DataFrame.apply`:
         s.pop("a")
         return s

+
     df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     df.apply(f, axis="columns")
@@ -225,6 +226,7 @@ not apply to the container being iterated over.
         s.pop("a")
         return s

+
     df = pd.DataFrame({"a": [1, 2, 3], 'b': [4, 5, 6]})
     df.apply(f, axis="columns")
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 7f957a8b16787..5a6aedb1050bd 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -267,9 +267,7 @@ the number of groups, which is the same as the length of the ``groups`` dictiona
     height = np.random.normal(60, 10, size=n)
     time = pd.date_range("1/1/2000", periods=n)
     gender = np.random.choice(["male", "female"], size=n)
-    df = pd.DataFrame(
-        {"height": height, "weight": weight, "gender": gender}, index=time
-    )
+    df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time)
     df
     gb = df.groupby("gender")
@@ -679,9 +677,9 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
 .. ipython:: python

     (
-        grouped[["C", "D"]].agg(["sum", "mean", "std"]).rename(
-            columns={"sum": "foo", "mean": "bar", "std": "baz"}
-        )
+        grouped[["C", "D"]]
+        .agg(["sum", "mean", "std"])
+        .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"})
     )

 .. note::
@@ -747,9 +745,7 @@ and unpack the keyword arguments
 .. ipython:: python

     animals.groupby("kind").agg(
-        **{
-            "total weight": pd.NamedAgg(column="weight", aggfunc="sum")
-        }
+        **{"total weight": pd.NamedAgg(column="weight", aggfunc="sum")}
     )

 When using named aggregation, additional keyword arguments are not passed through
@@ -910,9 +906,7 @@ Suppose we wish to standardize the data within each group:
     ts.head()
     ts.tail()

-    transformed = ts.groupby(lambda x: x.year).transform(
-        lambda x: (x - x.mean()) / x.std()
-    )
+    transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std())

 We would expect the result to now have mean 0 and standard deviation 1 within
@@ -1196,9 +1190,10 @@ The dimension of the returned result can also change:

     grouped = df.groupby('A')['C']

+
     def f(group):
-        return pd.DataFrame({'original': group,
-                             'demeaned': group - group.mean()})
+        return pd.DataFrame({'original': group, 'demeaned': group - group.mean()})
+

     grouped.apply(f)
@@ -1208,7 +1203,7 @@ that is itself a series, and possibly upcast the result to a DataFrame:
 .. ipython:: python

     def f(x):
-        return pd.Series([x, x ** 2], index=["x", "x^2"])
+        return pd.Series([x, x**2], index=["x", "x^2"])

     s = pd.Series(np.random.rand(5))
@@ -1369,10 +1364,10 @@ end of the result in order.
         categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
     )
     data = pd.DataFrame(
-       {
-           "day": days,
-           "workers": [3, 4, 1, 4, 2, 2],
-       }
+        {
+            "day": days,
+            "workers": [3, 4, 1, 4, 2, 2],
+        }
     )
     data
@@ -1713,10 +1708,12 @@ column index name will be used as the name of the inserted column:
         }
     )

+
     def compute_metrics(x):
         result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
         return pd.Series(result, name="metrics")

+
     result = df.groupby("a").apply(compute_metrics, include_groups=False)
     result
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index f0d6a76f0de5b..aed54097ceb45 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -32,6 +32,7 @@ or:
 .. ipython:: python

     import pandas as pd
+
     pd.DataFrame({'A': [1, 2, 3]})

 The first block is a standard python input, while in the second the ``In [1]:`` indicates the input is inside a `notebook `__. In Jupyter Notebooks the last line is printed and plots are shown inline.
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 24cdbad41fe60..397122715acdd 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -127,8 +127,7 @@ indexing functionality:
 .. ipython:: python

     dates = pd.date_range('1/1/2000', periods=8)
-    df = pd.DataFrame(np.random.randn(8, 4),
-                      index=dates, columns=['A', 'B', 'C', 'D'])
+    df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
     df

 .. note::
@@ -184,7 +183,7 @@ columns.

     df[['A', 'B']]
     df.iloc[:, [1, 0]] = df[['A', 'B']]
-    df[['A','B']]
+    df[['A', 'B']]


 Attribute access
@@ -250,7 +249,7 @@ new column and will this raise a ``UserWarning``:
 .. ipython:: python
    :okwarning:

-    df_new = pd.DataFrame({'one': [1., 2., 3.]})
+    df_new = pd.DataFrame({'one': [1.0, 2.0, 3.0]})
     df_new.two = [4, 5, 6]
     df_new
@@ -300,9 +299,11 @@ Selection by label
 .. ipython:: python
    :okexcept:

-    dfl = pd.DataFrame(np.random.randn(5, 4),
-                       columns=list('ABCD'),
-                       index=pd.date_range('20130101', periods=5))
+    dfl = pd.DataFrame(
+        np.random.randn(5, 4),
+        columns=list('ABCD'),
+        index=pd.date_range('20130101', periods=5),
+    )
     dfl
     dfl.loc[2:3]
@@ -345,9 +346,7 @@ With a DataFrame:

 .. ipython:: python

-    df1 = pd.DataFrame(np.random.randn(6, 4),
-                       index=list('abcdef'),
-                       columns=list('ABCD'))
+    df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
     df1
     df1.loc[['a', 'b', 'd'], :]
@@ -462,9 +461,9 @@ With a DataFrame:

 .. ipython:: python

-    df1 = pd.DataFrame(np.random.randn(6, 4),
-                       index=list(range(0, 12, 2)),
-                       columns=list(range(0, 8, 2)))
+    df1 = pd.DataFrame(
+        np.random.randn(6, 4), index=list(range(0, 12, 2)), columns=list(range(0, 8, 2))
+    )
     df1

 Select via integer slicing:
@@ -554,9 +553,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat

 .. ipython:: python

-    df1 = pd.DataFrame(np.random.randn(6, 4),
-                       index=list('abcdef'),
-                       columns=list('ABCD'))
+    df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
     df1

     df1.loc[lambda df: df['A'] > 0, :]
@@ -579,8 +576,7 @@ without using a temporary variable.
 .. ipython:: python

     bb = pd.read_csv('data/baseball.csv', index_col='id')
-    (bb.groupby(['year', 'team']).sum(numeric_only=True)
-       .loc[lambda df: df['r'] > 100])
+    (bb.groupby(['year', 'team']).sum(numeric_only=True).loc[lambda df: df['r'] > 100])

 .. _combining_positional_and_label_based_indexing:
@@ -592,9 +588,7 @@ If you wish to get the 0th and the 2nd elements from the index in the 'A' column

 .. ipython:: python

-    dfd = pd.DataFrame({'A': [1, 2, 3],
-                        'B': [4, 5, 6]},
-                       index=list('abc'))
+    dfd = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc'))
     dfd
     dfd.loc[dfd.index[[0, 2]], 'A']
@@ -708,8 +702,7 @@ as a string.

 .. ipython:: python

-    df2 = pd.DataFrame({'col1': [9, 8, 7, 6],
-                        'weight_column': [0.5, 0.4, 0.1, 0]})
+    df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]})
     df2.sample(n=3, weights='weight_column')

 ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument.
@@ -742,15 +735,14 @@ In the ``Series`` case this is effectively an appending operation.

     se = pd.Series([1, 2, 3])
     se
-    se[5] = 5.
+    se[5] = 5.0
     se

 A ``DataFrame`` can be enlarged on either axis via ``.loc``.

 .. ipython:: python

-    dfi = pd.DataFrame(np.arange(6).reshape(3, 2),
-                       columns=['A', 'B'])
+    dfi = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])
     dfi
     dfi.loc[:, 'C'] = dfi.loc[:, 'A']
     dfi
@@ -830,9 +822,13 @@ more complex criteria:

 .. ipython:: python

-    df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
-                        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
-                        'c': np.random.randn(7)})
+    df2 = pd.DataFrame(
+        {
+            'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
+            'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
+            'c': np.random.randn(7),
+        }
+    )

     # only want 'two' or 'three'
     criterion = df2['a'].map(lambda x: x.startswith('t'))
@@ -860,10 +856,8 @@ and :ref:`Advanced Indexing ` you may select along more than one axis

 .. ipython:: python

-    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]],
-                      index=list('abc'),
-                      columns=['A', 'B'])
-    s = (df['A'] > 2)
+    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=list('abc'), columns=['A', 'B'])
+    s = df['A'] > 2
     s
     df.loc[s, 'B']
@@ -901,8 +895,9 @@ in the membership check:

 .. ipython:: python

-    s_mi = pd.Series(np.arange(6),
-                     index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
+    s_mi = pd.Series(
+        np.arange(6), index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])
+    )
     s_mi
     s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
     s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
@@ -914,8 +909,9 @@ wherever the element is in the sequence of values.

 .. ipython:: python

-    df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
-                       'ids2': ['a', 'n', 'c', 'n']})
+    df = pd.DataFrame(
+        {'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], 'ids2': ['a', 'n', 'c', 'n']}
+    )

     values = ['a', 'b', 1, 3]
@@ -980,8 +976,7 @@ The code below is equivalent to ``df.where(df < 0)``.
 .. ipython:: python

     dates = pd.date_range('1/1/2000', periods=8)
-    df = pd.DataFrame(np.random.randn(8, 4),
-                      index=dates, columns=['A', 'B', 'C', 'D'])
+    df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
     df[df < 0]

 In addition, ``where`` takes an optional ``other`` argument for replacement of
@@ -1048,9 +1043,7 @@ as condition and ``other`` argument.

 .. ipython:: python

-    df3 = pd.DataFrame({'A': [1, 2, 3],
-                        'B': [4, 5, 6],
-                        'C': [7, 8, 9]})
+    df3 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
     df3.where(lambda x: x > 4, lambda x: x + 10)

 Mask
@@ -1091,7 +1084,7 @@ as a fallback, you can do the following.
     conditions = [
         (df['col2'] == 'Z') & (df['col1'] == 'A'),
         (df['col2'] == 'Z') & (df['col1'] == 'B'),
-        (df['col1'] == 'B')
+        (df['col1'] == 'B'),
     ]
     choices = ['yellow', 'blue', 'purple']
     df['color'] = np.select(conditions, choices, default='black')
@@ -1252,9 +1245,14 @@ The ``in`` and ``not in`` operators
 .. ipython:: python

     # get all rows where columns "a" and "b" have overlapping values
-    df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
-                       'c': np.random.randint(5, size=12),
-                       'd': np.random.randint(9, size=12)})
+    df = pd.DataFrame(
+        {
+            'a': list('aabbccddeeff'),
+            'b': list('aaaabbbbcccc'),
+            'c': np.random.randint(5, size=12),
+            'd': np.random.randint(9, size=12),
+        }
+    )
     df
     df.query('a in b')
@@ -1342,10 +1340,9 @@ Of course, expressions can be arbitrarily complex too:
     shorter = df.query('a < b < c and (not bools) or bools > 2')

     # equivalent in pure Python
-    longer = df[(df['a'] < df['b'])
-                & (df['b'] < df['c'])
-                & (~df['bools'])
-                | (df['bools'] > 2)]
+    longer = df[
+        (df['a'] < df['b']) & (df['b'] < df['c']) & (~df['bools']) | (df['bools'] > 2)
+    ]

     shorter
     longer
@@ -1377,8 +1374,7 @@ floating point values generated using ``numpy.random.randn()``.

 .. ipython:: python

-    df = pd.DataFrame(np.random.randn(8, 4),
-                      index=dates, columns=['A', 'B', 'C', 'D'])
+    df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])

     df2 = df.copy()
@@ -1403,9 +1399,13 @@ each method has a ``keep`` parameter to specify targets to be kept.

 .. ipython:: python

-    df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
-                        'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
-                        'c': np.random.randn(7)})
+    df2 = pd.DataFrame(
+        {
+            'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
+            'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
+            'c': np.random.randn(7),
+        }
+    )
     df2
     df2.duplicated('a')
     df2.duplicated('a', keep='last')
@@ -1426,9 +1426,9 @@ The same set of options are available for the ``keep`` parameter.

 .. ipython:: python

-    df3 = pd.DataFrame({'a': np.arange(6),
-                        'b': np.random.randn(6)},
-                       index=['a', 'a', 'b', 'c', 'b', 'a'])
+    df3 = pd.DataFrame(
+        {'a': np.arange(6), 'b': np.random.randn(6)}, index=['a', 'a', 'b', 'c', 'b', 'a']
+    )
     df3
     df3.index.duplicated()
     df3[~df3.index.duplicated()]
@@ -1460,9 +1460,9 @@ For instance:

 .. ipython:: python

-    df = pd.DataFrame({'col': ["A", "A", "B", "B"],
-                       'A': [80, 23, np.nan, 22],
-                       'B': [80, 55, 76, 67]})
+    df = pd.DataFrame(
+        {'col': ["A", "A", "B", "B"], 'A': [80, 23, np.nan, 22], 'B': [80, 55, 76, 67]}
+    )
     df
     idx, cols = pd.factorize(df['col'])
     df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
@@ -1551,7 +1551,9 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes.

 .. ipython:: python

-    index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
+    index = pd.MultiIndex.from_product(
+        [range(3), ['one', 'two']], names=['first', 'second']
+    )
     index
     index.levels[1]
     index.set_levels(["a", "b"], level=1)
@@ -1615,9 +1617,9 @@ Missing values
     idx1
     idx1.fillna(2)

-    idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'),
-                             pd.NaT,
-                             pd.Timestamp('2011-01-03')])
+    idx2 = pd.DatetimeIndex(
+        [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]
+    )
     idx2
     idx2.fillna(pd.Timestamp('2011-01-02'))
@@ -1639,10 +1641,14 @@ To create a new, re-indexed DataFrame:

 .. ipython:: python

-    data = pd.DataFrame({'a': ['bar', 'bar', 'foo', 'foo'],
-                         'b': ['one', 'two', 'one', 'two'],
-                         'c': ['z', 'y', 'x', 'w'],
-                         'd': [1., 2., 3, 4]})
+    data = pd.DataFrame(
+        {
+            'a': ['bar', 'bar', 'foo', 'foo'],
+            'b': ['one', 'two', 'one', 'two'],
+            'c': ['z', 'y', 'x', 'w'],
+            'd': [1.0, 2.0, 3, 4],
+        }
+    )
     data
     indexed1 = data.set_index('c')
     indexed1
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 9c48e66daacf0..8dce6a9886ce1 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1075,14 +1075,20 @@ writing to a file). For example:
             StringIO(data),
             engine="c",
             float_precision=None,
-        )["c"][0] - float(val)
+        )[
+            "c"
+        ][0]
+        - float(val)
     )
     abs(
         pd.read_csv(
             StringIO(data),
             engine="c",
             float_precision="high",
-        )["c"][0] - float(val)
+        )[
+            "c"
+        ][0]
+        - float(val)
     )
     abs(
         pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0]
@@ -1235,9 +1241,13 @@ The bad line will be a list of strings that was split by the ``sep``:
 .. ipython:: python

     external_list = []
+
+
     def bad_lines_func(line):
         external_list.append(line)
         return line[-3:]
+
+
     pd.read_csv(StringIO(data), on_bad_lines=bad_lines_func, engine="python")
     external_list
@@ -1670,7 +1680,7 @@ of header key value mappings to the ``storage_options`` keyword argument as show
     df = pd.read_csv(
         "https://download.bls.gov/pub/time.series/cu/cu.item",
         sep="\t",
-        storage_options=headers
+        storage_options=headers,
     )

 All URLs which are not local files or HTTP(s) are handled by
@@ -2097,6 +2107,7 @@ Reading from a JSON string:
 .. ipython:: python

     from io import StringIO
+
     pd.read_json(StringIO(json))

 Reading from a file:
@@ -2122,6 +2133,7 @@ Preserve string indices:
 .. ipython:: python

     from io import StringIO
+
     si = pd.DataFrame(
         np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)]
     )
@@ -2140,6 +2152,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:
 .. ipython:: python

     from io import StringIO
+
     json = dfj2.to_json(date_unit="ns")

     # Try to parse timestamps as milliseconds -> Won't Work
@@ -2159,9 +2172,9 @@ By setting the ``dtype_backend`` argument you can control the default dtypes use
 .. ipython:: python

     data = (
-     '{"a":{"0":1,"1":3},"b":{"0":2.5,"1":4.5},"c":{"0":true,"1":false},"d":{"0":"a","1":"b"},'
-     '"e":{"0":null,"1":6.0},"f":{"0":null,"1":7.5},"g":{"0":null,"1":true},"h":{"0":null,"1":"a"},'
-     '"i":{"0":"12-31-2019","1":"12-31-2019"},"j":{"0":null,"1":null}}'
+        '{"a":{"0":1,"1":3},"b":{"0":2.5,"1":4.5},"c":{"0":true,"1":false},"d":{"0":"a","1":"b"},'
+        '"e":{"0":null,"1":6.0},"f":{"0":null,"1":7.5},"g":{"0":null,"1":true},"h":{"0":null,"1":"a"},'
+        '"i":{"0":"12-31-2019","1":"12-31-2019"},"j":{"0":null,"1":null}}'
     )
     df = pd.read_json(StringIO(data), dtype_backend="pyarrow")
     df
@@ -2240,6 +2253,7 @@ For line-delimited json files, pandas can also return an iterator which reads in
 .. ipython:: python

     from io import StringIO
+
     jsonl = """
         {"a": 1, "b": 2}
         {"a": 3, "b": 4}
@@ -2259,6 +2273,7 @@ Line-limited json can also be read using the pyarrow reader by specifying ``engi
 .. ipython:: python

     from io import BytesIO
+
     df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow")
     df
@@ -2689,10 +2704,7 @@ Links can be extracted from cells along with the text using ``extract_links="all
     """

-    df = pd.read_html(
-        StringIO(html_table),
-        extract_links="all"
-    )[0]
+    df = pd.read_html(StringIO(html_table), extract_links="all")[0]
     df
     df[("GitHub", None)]
     df[("GitHub", None)].str[1]
@@ -2946,7 +2958,8 @@ Read an XML string:
 .. ipython:: python

-   from io import StringIO
+    from io import StringIO
+
     xml = """
@@ -3081,9 +3094,9 @@ For example, below XML contains a namespace with prefix, ``doc``, and URI at
     """

-    df = pd.read_xml(StringIO(xml),
-                     xpath="//doc:row",
-                     namespaces={"doc": "https://example.com"})
+    df = pd.read_xml(
+        StringIO(xml), xpath="//doc:row", namespaces={"doc": "https://example.com"}
+    )
     df

 Similarly, an XML document can have a default namespace without prefix. Failing
@@ -3111,9 +3124,9 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
     """

-    df = pd.read_xml(StringIO(xml),
-                     xpath="//pandas:row",
-                     namespaces={"pandas": "https://example.com"})
+    df = pd.read_xml(
+        StringIO(xml), xpath="//pandas:row", namespaces={"pandas": "https://example.com"}
+    )
     df

 However, if XPath does not reference node names such as default, ``/*``, then
@@ -3340,12 +3353,7 @@ Write a mix of elements and attributes:

 .. ipython:: python

-    print(
-        geom_df.to_xml(
-            index=False,
-            attr_cols=['shape'],
-            elem_cols=['degrees', 'sides'])
-    )
+    print(geom_df.to_xml(index=False, attr_cols=['shape'], elem_cols=['degrees', 'sides']))

 Any ``DataFrames`` with hierarchical columns will be flattened for XML element names
 with levels delimited by underscores:
@@ -3361,10 +3369,9 @@ with levels delimited by underscores:
         }
     )

-    pvt_df = ext_geom_df.pivot_table(index='shape',
-                                     columns='type',
-                                     values=['degrees', 'sides'],
-                                     aggfunc='sum')
+    pvt_df = ext_geom_df.pivot_table(
+        index='shape', columns='type', values=['degrees', 'sides'], aggfunc='sum'
+    )
     pvt_df

     print(pvt_df.to_xml())
@@ -3379,19 +3386,13 @@ Write an XML with namespace prefix:

 .. ipython:: python

-    print(
-        geom_df.to_xml(namespaces={"doc": "https://example.com"},
-                       prefix="doc")
-    )
+    print(geom_df.to_xml(namespaces={"doc": "https://example.com"}, prefix="doc"))

 Write an XML without declaration or pretty print:

 .. ipython:: python

-    print(
-        geom_df.to_xml(xml_declaration=False,
-                       pretty_print=False)
-    )
+    print(geom_df.to_xml(xml_declaration=False, pretty_print=False))

 Write an XML and transform with stylesheet:
@@ -3920,7 +3921,9 @@ The look and feel of Excel worksheets created from pandas can be modified using
 .. code-block:: python

     css = "border: 1px solid black; font-weight: bold;"
-    df.style.map_index(lambda x: css).map_index(lambda x: css, axis=1).to_excel("myfile.xlsx")
+    df.style.map_index(lambda x: css).map_index(lambda x: css, axis=1).to_excel(
+        "myfile.xlsx"
+    )

 Using the `Xlsxwriter`_ engine provides many options for controlling the format
 of an Excel worksheet created with the ``to_excel`` method. Excellent examples can be found in the
@@ -4400,7 +4403,7 @@ will yield a tuple for each group key along with the relative keys of its conten

 .. ipython:: python

-    for (path, subgroups, subkeys) in store.walk():
+    for path, subgroups, subkeys in store.walk():
         for subgroup in subgroups:
             print("GROUP: {}/{}".format(path, subgroup))
         for subkey in subkeys:
@@ -4483,9 +4486,9 @@ storing/selecting from homogeneous index ``DataFrames``.
 .. ipython:: python

     index = pd.MultiIndex(
-       levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
-       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
-       names=["foo", "bar"],
+        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
+        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+        names=["foo", "bar"],
     )
     df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
     df_mi
@@ -4653,8 +4656,7 @@ specified in the format: ``()``, where float may be signed (and fra
         {
             "A": pd.Timestamp("20130101"),
             "B": [
-                pd.Timestamp("20130101") + timedelta(days=i, seconds=10)
-                for i in range(10)
+                pd.Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10)
             ],
         }
     )
@@ -4826,8 +4828,10 @@ chunks.

     store.append("dfeq", dfeq, data_columns=["number"])

+
     def chunks(l, n):
-        return [l[i: i + n] for i in range(0, len(l), n)]
+        return [l[i : i + n] for i in range(0, len(l), n)]
+

     evens = [2, 4, 6, 8, 10]
     coordinates = store.select_as_coordinates("dfeq", "number=evens")
@@ -5408,7 +5412,9 @@ By setting the ``dtype_backend`` argument you can control the default dtypes use

 .. ipython:: python

-    result = pd.read_parquet("example_pa.parquet", engine="pyarrow", dtype_backend="pyarrow")
+    result = pd.read_parquet(
+        "example_pa.parquet", engine="pyarrow", dtype_backend="pyarrow"
+    )
     result.dtypes
@@ -5644,7 +5650,7 @@ to connect to your database.

     # Create the connection
     with sqlite_dbapi.connect("sqlite:///:memory:") as conn:
-         df = pd.read_sql_table("data", conn)
+        df = pd.read_sql_table("data", conn)

 To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine
 object from database URI. You only need to create the engine once per database you are
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 1edf3908936db..7af8bf7c39d9d 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -412,20 +412,20 @@ either the left or right tables, the values in the joined table will be
 .. ipython:: python

     left = pd.DataFrame(
-       {
-           "key1": ["K0", "K0", "K1", "K2"],
-           "key2": ["K0", "K1", "K0", "K1"],
-           "A": ["A0", "A1", "A2", "A3"],
-           "B": ["B0", "B1", "B2", "B3"],
-       }
+        {
+            "key1": ["K0", "K0", "K1", "K2"],
+            "key2": ["K0", "K1", "K0", "K1"],
+            "A": ["A0", "A1", "A2", "A3"],
+            "B": ["B0", "B1", "B2", "B3"],
+        }
     )
     right = pd.DataFrame(
-       {
-           "key1": ["K0", "K1", "K1", "K2"],
-           "key2": ["K0", "K0", "K0", "K0"],
-           "C": ["C0", "C1", "C2", "C3"],
-           "D": ["D0", "D1", "D2", "D3"],
-       }
+        {
+            "key1": ["K0", "K1", "K1", "K2"],
+            "key2": ["K0", "K0", "K0", "K0"],
+            "C": ["C0", "C1", "C2", "C3"],
+            "D": ["D0", "D1", "D2", "D3"],
+        }
     )
     result = pd.merge(left, right, how="left", on=["key1", "key2"])
     result
@@ -696,9 +696,7 @@ aligned.

 .. ipython:: python

-    result = pd.merge(
-        left, right, left_on="key", right_index=True, how="left", sort=False
-    )
+    result = pd.merge(left, right, left_on="key", right_index=True, how="left", sort=False)
     result

 .. ipython:: python
@@ -808,9 +806,7 @@ in the join and is a subset of the indices in the left argument.
     left = pd.DataFrame({"v1": range(12)}, index=leftindex)
     left

-    rightindex = pd.MultiIndex.from_product(
-        [list("abc"), list("xy")], names=["abc", "xy"]
-    )
+    rightindex = pd.MultiIndex.from_product([list("abc"), list("xy")], names=["abc", "xy"])
     right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)
     right
@@ -821,9 +817,7 @@ in the join and is a subset of the indices in the left argument.
     leftindex = pd.MultiIndex.from_tuples(
         [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
     )
-    left = pd.DataFrame(
-        {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex
-    )
+    left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex)

     rightindex = pd.MultiIndex.from_tuples(
         [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
@@ -937,9 +931,7 @@ location.

 .. ipython:: python

-    df1 = pd.DataFrame(
-        [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]
-    )
+    df1 = pd.DataFrame([[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]])
     df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2])
     result = df1.combine_first(df2)
     result
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index aea7688c062b8..184f27253c388 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -129,8 +129,8 @@ operands is ``NA``.

 .. ipython:: python

-    pd.NA ** 0
-    1 ** pd.NA
+    pd.NA**0
+    1**pd.NA

 In equality and comparison operations, :class:`NA` also propagates. This deviates
 from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
@@ -264,6 +264,7 @@ the first 10 columns.
 .. ipython:: python

     import io
+
     data = io.StringIO("a,b\n,True\n2,")
     df = pd.read_csv(data)
     df.dtypes
@@ -281,7 +282,7 @@ The missing value sentinel used will be chosen based on the dtype.

 .. ipython:: python

-    ser = pd.Series([1., 2., 3.])
+    ser = pd.Series([1.0, 2.0, 3.0])
     ser.loc[0] = None
     ser
@@ -381,7 +382,10 @@ Replace NA with a scalar value

 .. ipython:: python

-    data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")}
+    data = {
+        "np": [1.0, np.nan, np.nan, 2],
+        "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]"),
+    }
     df = pd.DataFrame(data)
     df
     df.fillna(0)
@@ -493,10 +497,10 @@ The appropriate interpolation method will depend on the data type.
 .. ipython:: python

     df = pd.DataFrame(
-       {
-           "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8],
-           "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4],
-       }
+        {
+            "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8],
+            "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4],
+        }
     )
     df
     df.interpolate(method="barycentric")
diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst
index 61b383afb7c43..c5f9526c6b374 100644
--- a/doc/source/user_guide/pyarrow.rst
+++ b/doc/source/user_guide/pyarrow.rst
@@ -45,6 +45,7 @@ which is similar to a NumPy array. To construct these from the main pandas data
 .. ipython:: python

     import pyarrow as pa
+
     data = list("abc")
     ser_sd = pd.Series(data, dtype="string[pyarrow]")
     ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
@@ -58,6 +59,7 @@ into :class:`ArrowDtype` to use in the ``dtype`` parameter.
 .. ipython:: python

     import pyarrow as pa
+
     list_str_type = pa.list_(pa.string())
     ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
     ser
@@ -65,12 +67,14 @@ into :class:`ArrowDtype` to use in the ``dtype`` parameter.
 .. ipython:: python

     from datetime import time
+
     idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
     idx

 .. ipython:: python

     from decimal import Decimal
+
     decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
     data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]
     df = pd.DataFrame(data, dtype=decimal_type)
@@ -131,6 +135,7 @@ The following are just some examples of operations that are accelerated by nativ
 .. ipython:: python

     import pyarrow as pa
+
     ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
     ser.mean()
     ser + ser
@@ -148,6 +153,7 @@ The following are just some examples of operations that are accelerated by nativ
 .. ipython:: python

     from datetime import datetime
+
     pa_type = pd.ArrowDtype(pa.timestamp("ns"))
     ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
     ser_dt.dt.strftime("%Y-%m")
@@ -166,10 +172,13 @@ functions provide an ``engine`` keyword that can dispatch to PyArrow to accelera
 .. ipython:: python

     import io
-    data = io.StringIO("""a,b,c
+
+    data = io.StringIO(
+        """a,b,c
     1,2.5,True
     3,4.5,False
-    """)
+    """
+    )
     df = pd.read_csv(data, engine="pyarrow")
     df
@@ -180,10 +189,13 @@ PyArrow-backed data by specifying the parameter ``dtype_backend="pyarrow"``. A r
 .. ipython:: python

     import io
-    data = io.StringIO("""a,b,c,d,e,f,g,h,i
+
+    data = io.StringIO(
+        """a,b,c,d,e,f,g,h,i
     1,2.5,True,a,,,,,
     3,4.5,False,b,6,7.5,True,a,
-    """)
+    """
+    )
     df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
     df_pyarrow.dtypes
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 3347f3a2534f4..333eb98399d40 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -37,9 +37,9 @@ multiple rows for each subject where applicable.
 .. ipython:: python

     data = {
-       "value": range(12),
-       "variable": ["A"] * 3 + ["B"] * 3 + ["C"] * 3 + ["D"] * 3,
-       "date": pd.to_datetime(["2020-01-03", "2020-01-04", "2020-01-05"] * 4)
+        "value": range(12),
+        "variable": ["A"] * 3 + ["B"] * 3 + ["C"] * 3 + ["D"] * 3,
+        "date": pd.to_datetime(["2020-01-03", "2020-01-04", "2020-01-05"] * 4),
     }
     df = pd.DataFrame(data)
@@ -112,13 +112,15 @@ strategies.
     df
     pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])
     pd.pivot_table(
-        df, values=["D", "E"],
+        df,
+        values=["D", "E"],
         index=["B"],
         columns=["A", "C"],
         aggfunc="sum",
     )
     pd.pivot_table(
-        df, values="E",
+        df,
+        values="E",
         index=["B", "C"],
         columns=["A"],
         aggfunc=["sum", "mean"],
@@ -150,11 +152,7 @@ rows and columns:
 .. ipython:: python

     table = df.pivot_table(
-        index=["A", "B"],
-        columns="C",
-        values=["D", "E"],
-        margins=True,
-        aggfunc="std"
+        index=["A", "B"], columns="C", values=["D", "E"], margins=True, aggfunc="std"
     )
     table
@@ -190,8 +188,8 @@ Closely related to the :meth:`~DataFrame.pivot` method are the related
 .. ipython:: python

     tuples = [
-       ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
-       ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+        ["one", "two", "one", "two", "one", "two", "one", "two"],
     ]
     index = pd.MultiIndex.from_arrays(tuples, names=["first", "second"])
     df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index b262de5d71439..07f57d55b0c13 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -24,6 +24,7 @@ Suppose our raw dataset on disk has many columns.
     import pandas as pd
     import numpy as np

+
     def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
         index = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
         n = len(index)
@@ -39,6 +40,7 @@ Suppose our raw dataset on disk has many columns.
         df = df.iloc[:-1]
         return df

+
     timeseries = [
         make_timeseries(freq="1min", seed=i).rename(columns=lambda x: f"{x}_{i}")
         for i in range(10)
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index 25bcb8bcc0c93..9c8ca9205898d 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -99,8 +99,7 @@ passed instead

 .. ipython:: python

-    pd.SparseDtype(np.dtype('datetime64[ns]'),
-                   fill_value=pd.Timestamp('2017-01-01'))
+    pd.SparseDtype(np.dtype('datetime64[ns]'), fill_value=pd.Timestamp('2017-01-01'))

 Finally, the string alias ``'Sparse[dtype]'`` may be used to specify a sparse dtype
 in many places
@@ -140,7 +139,7 @@ to :class:`arrays.SparseArray` and get a :class:`arrays.SparseArray` as a result

 .. ipython:: python

-    arr = pd.arrays.SparseArray([1., np.nan, np.nan, -2., np.nan])
+    arr = pd.arrays.SparseArray([1.0, np.nan, np.nan, -2.0, np.nan])
     np.abs(arr)
@@ -149,7 +148,7 @@ the correct dense result.

 .. ipython:: python

-    arr = pd.arrays.SparseArray([1., -1, -1, -2., -1], fill_value=-1)
+    arr = pd.arrays.SparseArray([1.0, -1, -1, -2.0, -1], fill_value=-1)
     np.abs(arr)
     np.abs(arr).to_dense()
@@ -183,7 +182,7 @@ Use :meth:`DataFrame.sparse.from_spmatrix` to create a :class:`DataFrame` with s
     from scipy.sparse import csr_matrix

     arr = np.random.random(size=(1000, 5))
-    arr[arr < .9] = 0
+    arr[arr < 0.9] = 0

     sp_arr = csr_matrix(arr)
     sp_arr
@@ -251,6 +250,7 @@ A convenience method :meth:`Series.sparse.from_coo` is implemented for creating
 .. ipython:: python

     from scipy import sparse
+
     A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
     A
     A.todense()
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index cf27fc8385223..33c73f5870bf4 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -294,9 +294,11 @@ positional argument (a regex object) and return a string.
     # Reverse every lowercase alphabetic word
     pat = r"[a-z]+"

+
     def repl(m):
         return m.group(0)[::-1]

+
     pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(
         pat, repl, regex=True
     )
@@ -304,12 +306,12 @@ positional argument (a regex object) and return a string.
     # Using regex groups
     pat = r"(?P\w+) (?P\w+) (?P\w+)"

+
     def repl(m):
         return m.group("two").swapcase()

-    pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(
-        pat, repl, regex=True
-    )
+
+    pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(pat, repl, regex=True)

 The ``replace`` method also accepts a compiled regular expression
 object from :func:`re.compile` as a pattern. All flags should be included in the
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 5daf204f39bcf..02e51f7f585eb 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -63,8 +63,10 @@ Further, operations among the scalars yield another scalar ``Timedelta``.

 .. ipython:: python

-    pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) + pd.Timedelta(
-        "00:00:00.000123"
+    (
+        pd.Timedelta(pd.offsets.Day(2))
+        + pd.Timedelta(pd.offsets.Second(2))
+        + pd.Timedelta("00:00:00.000123")
     )

 to_timedelta
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 0f0e6271d8329..485e028be978f 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -314,9 +314,7 @@ which can be specified. These are computed from the starting point specified by

 .. ipython:: python

-    pd.to_datetime(
-        [1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s"
-    )
+    pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s")

     pd.to_datetime(
         [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500],
@@ -568,7 +566,7 @@ Dates and strings that parse to timestamps can be passed as indexing parameters:

     ts["1/31/2011"]

-    ts[datetime.datetime(2011, 12, 25):]
+    ts[datetime.datetime(2011, 12, 25) :]

     ts["10/31/2011":"12/31/2011"]
@@ -702,9 +700,7 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra

 .. ipython:: python

-    dft_minute = pd.DataFrame(
-        {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index
-    )
+    dft_minute = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index)
     dft_minute.loc["2011-12-31 23"]
@@ -738,16 +734,14 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and

 .. ipython:: python

-    dft[datetime.datetime(2013, 1, 1): datetime.datetime(2013, 2, 28)]
+    dft[datetime.datetime(2013, 1, 1) : datetime.datetime(2013, 2, 28)]

 With no defaults.

 .. ipython:: python

     dft[
-        datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime(
-            2013, 2, 28, 10, 12, 0
-        )
+        datetime.datetime(2013, 1, 1, 10, 12, 0) : datetime.datetime(2013, 2, 28, 10, 12, 0)
     ]

 Truncating & fancy indexing
@@ -1484,6 +1478,7 @@ An example of how holidays and holiday calendars are defined:
         MO,
     )

+
     class ExampleCalendar(AbstractHolidayCalendar):
         rules = [
             USMemorialDay,
@@ -1496,6 +1491,7 @@ An example of how holidays and holiday calendars are defined:
             ),
         ]

+
     cal = ExampleCalendar()
     cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31))
@@ -1767,12 +1763,14 @@ We can instead only resample those groups where we have points as follows:
     from functools import partial
     from pandas.tseries.frequencies import to_offset

+
     def round(t, freq):
         # round a Timestamp to a specified freq
         freq = to_offset(freq)
         td = pd.Timedelta(freq)
         return pd.Timestamp((t.value // td.value) * td.value)

+
     ts.groupby(partial(round, freq="3min")).sum()

 .. _timeseries.aggregate:
@@ -2149,7 +2147,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI

     ps["2011-01"]

-    ps[datetime.datetime(2011, 12, 25):]
+    ps[datetime.datetime(2011, 12, 25) :]

     ps["10/31/2011":"12/31/2011"]
@@ -2307,9 +2305,11 @@ To convert from an ``int64`` based YYYYMMDD representation.
     s = pd.Series([20121231, 20141130, 99991231])
     s

+
     def conv(x):
         return pd.Period(year=x // 10000, month=x // 100 % 100, day=x % 100, freq="D")

+
     s.apply(conv)
     s.apply(conv)[2]
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 9081d13ef2cf1..77a6db75eb7b2 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1213,7 +1213,7 @@ it empty for ylabel.
 .. ipython:: python
    :suppress:

-    plt.figure();
+    plt.figure()

 .. ipython:: python
@@ -1428,7 +1428,7 @@ The above example is identical to using:

 .. ipython:: python

-    df.plot(subplots=True, layout=(2, -1), figsize=(6, 6), sharex=False);
+    df.plot(subplots=True, layout=(2, -1), figsize=(6, 6), sharex=False)

 .. ipython:: python
    :suppress:
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index e25c4c2441920..e95fee1b42ab9 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -96,8 +96,11 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei
         arr[:, :2] = (x[:, :2] * x[:, 2]).sum(axis=0) / x[:, 2].sum()
         return arr

+
     df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
-    df.rolling(2, method="table", min_periods=0).apply(weighted_mean, raw=True, engine="numba")  # noqa: E501
+    df.rolling(2, method="table", min_periods=0).apply(
+        weighted_mean, raw=True, engine="numba"
+    )  # noqa: E501

 .. versionadded:: 1.3
@@ -263,18 +266,20 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other

     from pandas.api.indexers import BaseIndexer

+
     class CustomIndexer(BaseIndexer):
-         def get_window_bounds(self, num_values, min_periods, center, closed, step):
-             start = np.empty(num_values, dtype=np.int64)
-             end = np.empty(num_values, dtype=np.int64)
-             for i in range(num_values):
-                 if self.use_expanding[i]:
-                     start[i] = 0
-                     end[i] = i + 1
-                 else:
-                     start[i] = i
-                     end[i] = i + self.window_size
-             return start, end
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+

     indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
@@ -305,6 +310,7 @@ forward-looking rolling window, and we can use it as follows:
 .. ipython:: python

     from pandas.api.indexers import FixedForwardWindowIndexer
+
     indexer = FixedForwardWindowIndexer(window_size=2)
     df.rolling(indexer, min_periods=1).sum()
@@ -341,6 +347,7 @@ the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objec
     def mad(x):
         return np.fabs(x - x.mean()).mean()

+
     s = pd.Series(range(10))
     s.rolling(window=4).apply(mad, raw=True)
@@ -429,11 +436,7 @@ can even be omitted:

 .. ipython:: python

-    covs = (
-        df[["B", "C", "D"]]
-        .rolling(window=4)
-        .cov(df[["A", "B", "C"]], pairwise=True)
-    )
+    covs = df[["B", "C", "D"]].rolling(window=4).cov(df[["A", "B", "C"]], pairwise=True)
     covs
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst
index be50c34d7d14c..b236355c5444b 100644
--- a/doc/source/whatsnew/v0.10.0.rst
+++ b/doc/source/whatsnew/v0.10.0.rst
@@ -264,7 +264,7 @@ Convenience methods ``ffill`` and ``bfill`` have been added:
 .. ipython:: python

     def f(x):
-        return pd.Series([x, x ** 2], index=["x", "x^2"])
+        return pd.Series([x, x**2], index=["x", "x^2"])

     s = pd.Series(np.random.rand(5))
diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst
index 611ac2021fcec..d71a0d5ca68cd 100644
--- a/doc/source/whatsnew/v0.10.1.rst
+++ b/doc/source/whatsnew/v0.10.1.rst
@@ -180,9 +180,7 @@ combined result, by using ``where`` on a selector table.
     store.select("df2_mt")

     # as a multiple
-    store.select_as_multiple(
-        ["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt"
-    )
+    store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt")

 .. ipython:: python
    :suppress:
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index dcb0d3229aa5d..79aed0eac611a 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -77,9 +77,13 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe
     df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
     df1
     df1.dtypes
-    df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
-                        'B': pd.Series(np.random.randn(8)),
-                        'C': pd.Series(range(8), dtype='uint8')})
+    df2 = pd.DataFrame(
+        {
+            'A': pd.Series(np.random.randn(8), dtype='float16'),
+            'B': pd.Series(np.random.randn(8)),
+            'C': pd.Series(range(8), dtype='uint8'),
+        }
+    )
     df2
     df2.dtypes
@@ -290,8 +294,9 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet

 .. ipython:: python

-    df = pd.DataFrame(np.random.randn(6, 2), pd.date_range('20010102', periods=6),
-                      columns=['A', ' B'])
+    df = pd.DataFrame(
+        np.random.randn(6, 2), pd.date_range('20010102', periods=6), columns=['A', ' B']
+    )
     df['timestamp'] = pd.Timestamp('20010103')
     df
@@ -307,6 +312,7 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT
 .. ipython:: python

     import datetime
+
     s = pd.Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)])
     s.dtype
     s[1] = np.nan
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index c805758f85b35..6e73811fcc986 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -203,6 +203,7 @@ IO enhancements
   .. ipython:: python

      import io
+
      df = pd.DataFrame({"a": range(3), "b": list("abc")})
      print(df)
      html = df.to_html()
diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst
index a624e81d17db9..041243725e85d 100644
--- a/doc/source/whatsnew/v0.13.0.rst
+++ b/doc/source/whatsnew/v0.13.0.rst
@@ -250,13 +250,12 @@ In the ``Series`` case this is effectively an appending operation

     s = pd.Series([1, 2, 3])
     s
-    s[5] = 5.
+    s[5] = 5.0
     s

 .. ipython:: python

-    dfi = pd.DataFrame(np.arange(6).reshape(3, 2),
-                       columns=['A', 'B'])
+    dfi = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])
     dfi

 This would previously ``KeyError``
@@ -382,29 +381,30 @@ HDFStore API changes
   .. ipython:: python

      path = 'test.h5'
-     dfq = pd.DataFrame(np.random.randn(10, 4),
-                        columns=list('ABCD'),
-                        index=pd.date_range('20130101', periods=10))
+     dfq = pd.DataFrame(
+         np.random.randn(10, 4),
+         columns=list('ABCD'),
+         index=pd.date_range('20130101', periods=10),
+     )
      dfq.to_hdf(path, key='dfq', format='table', data_columns=True)

   Use boolean expressions, with in-line function evaluation.

   .. ipython:: python

-     pd.read_hdf(path, 'dfq',
-                 where="index>Timestamp('20130104') & columns=['A', 'B']")
+     pd.read_hdf(path, 'dfq', where="index>Timestamp('20130104') & columns=['A', 'B']")

   Use an inline column reference

   .. ipython:: python

-     pd.read_hdf(path, 'dfq',
-                 where="A>0 or C>0")
+     pd.read_hdf(path, 'dfq', where="A>0 or C>0")

   .. ipython:: python
      :suppress:

      import os
+
      os.remove(path)

 - the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)``
@@ -425,6 +425,7 @@ HDFStore API changes
      :suppress:

      import os
+
      os.remove(path)

 - Significant table writing performance improvements
@@ -460,6 +461,7 @@ HDFStore API changes
      :suppress:

      import os
+
      os.remove(path)

 - removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if retrieving
@@ -539,8 +541,10 @@ Enhancements
   .. ipython:: python

      import datetime
+
      td = pd.Series(pd.date_range('20130101', periods=4)) - pd.Series(
-        pd.date_range('20121201', periods=4))
+        pd.date_range('20121201', periods=4)
+     )
      td[2] += np.timedelta64(datetime.timedelta(minutes=5, seconds=3))
      td[3] = np.nan
      td
@@ -593,6 +597,7 @@ Enhancements
   .. ipython:: python

      from pandas import offsets
+
      td + offsets.Minute(5) + offsets.Milli(5)

   Fillna is now supported for timedeltas
@@ -607,7 +612,7 @@ Enhancements
   .. ipython:: python

      td.mean()
-     td.quantile(.1)
+     td.quantile(0.1)

 - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and
   ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
@@ -643,16 +648,14 @@ Enhancements

 ..
ipython:: python :okwarning: - pd.Series(['a1', 'b2', 'c3']).str.extract( - '(?P[ab])(?P\\d)') + pd.Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\\d)') and optional groups can also be used. .. ipython:: python :okwarning: - pd.Series(['a1', 'b2', '3']).str.extract( - '(?P[ab])?(?P\\d)') + pd.Series(['a1', 'b2', '3']).str.extract('(?P[ab])?(?P\\d)') - ``read_stata`` now accepts Stata 13 format (:issue:`4291`) @@ -714,6 +717,7 @@ Enhancements # note that pandas.rpy was deprecated in v0.16.0 import pandas.rpy.common as com + com.load_data('Titanic') - ``tz_localize`` can infer a fall daylight savings transition based on the structure @@ -738,8 +742,9 @@ Enhancements .. ipython:: python - df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], - 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df = pd.DataFrame( + {'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], 'B': [0.25, np.nan, np.nan, 4, 12.2, 14.4]} + ) df.interpolate() Additionally, the ``method`` argument to ``interpolate`` has been expanded @@ -761,12 +766,15 @@ Enhancements .. ipython:: python np.random.seed(123) - df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, - "A1980" : {0 : "d", 1 : "e", 2 : "f"}, - "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, - "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, - "X" : dict(zip(range(3), np.random.randn(3))) - }) + df = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), np.random.randn(3))), + } + ) df["id"] = df.index df pd.wide_to_long(df, ["A", "B"], i="id", j="year") @@ -793,8 +801,7 @@ Experimental .. ipython:: python nrows, ncols = 20000, 100 - df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) - for _ in range(4)] + df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)] .. ipython:: python @@ -923,8 +930,11 @@ Experimental # Use pandas to process and reshape the dataset df2 = df.pivot(index='STATION', columns='MONTH', values='MEAN_TEMP') - df3 = pd.concat([df2.min(), df2.mean(), df2.max()], - axis=1, keys=["Min Tem", "Mean Temp", "Max Temp"]) + df3 = pd.concat( + [df2.min(), df2.mean(), df2.max()], + axis=1, + keys=["Min Tem", "Mean Temp", "Max Temp"], + ) The resulting DataFrame is:: diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 483dd15a8467a..90dea890de3ce 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -100,9 +100,7 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame( - [pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"] - ) + df = pd.DataFrame([pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"]) df["today"] = pd.Timestamp("20130419") df["diff"] = df["today"] - df["age"] df @@ -207,9 +205,7 @@ Enhancements .. 
code-block:: python # Try to infer the format for the index column - df = pd.read_csv( - "foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True - ) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 05831197936eb..700f55a8dd632 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -134,6 +134,7 @@ API changes np.random.seed(1234) from itertools import product + tuples = list(product(('a', 'b'), ('c', 'd'))) mi = pd.MultiIndex.from_tuples(tuples) df_multi = pd.DataFrame(np.random.randn(4, 2), index=mi) @@ -274,18 +275,30 @@ Display changes .. ipython:: python - dfd = pd.DataFrame(np.arange(25).reshape(-1, 5), - index=[0, 1, 2, 3, 4], - columns=[0, 1, 2, 3, 4]) + dfd = pd.DataFrame( + np.arange(25).reshape(-1, 5), index=[0, 1, 2, 3, 4], columns=[0, 1, 2, 3, 4] + ) # show dimensions since this is truncated - with pd.option_context('display.max_rows', 2, 'display.max_columns', 2, - 'display.show_dimensions', 'truncate'): + with pd.option_context( + 'display.max_rows', + 2, + 'display.max_columns', + 2, + 'display.show_dimensions', + 'truncate', + ): print(dfd) # will not show dimensions since it is not truncated - with pd.option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.show_dimensions', 'truncate'): + with pd.option_context( + 'display.max_rows', + 10, + 'display.max_columns', + 40, + 'display.show_dimensions', + 'truncate', + ): print(dfd) - Regression in the display of a MultiIndexed Series with ``display.max_rows`` is less than the @@ -446,6 +459,7 @@ connecting to. For an in-memory sqlite database: .. ipython:: python from sqlalchemy import create_engine + # Create your connection. engine = create_engine('sqlite:///:memory:') @@ -534,17 +548,22 @@ See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :is def mklbl(prefix, n): return ["%s%s" % (prefix, i) for i in range(n)] - index = pd.MultiIndex.from_product([mklbl('A', 4), - mklbl('B', 2), - mklbl('C', 4), - mklbl('D', 2)]) - columns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = pd.DataFrame(np.arange(len(index) * len(columns)).reshape((len(index), - len(columns))), - index=index, - columns=columns).sort_index().sort_index(axis=1) + + index = pd.MultiIndex.from_product( + [mklbl('A', 4), mklbl('B', 2), mklbl('C', 4), mklbl('D', 2)] + ) + columns = pd.MultiIndex.from_tuples( + [('a', 'foo'), ('a', 'bar'), ('b', 'foo'), ('b', 'bah')], names=['lvl0', 'lvl1'] + ) + df = ( + pd.DataFrame( + np.arange(len(index) * len(columns)).reshape((len(index), len(columns))), + index=index, + columns=columns, + ) + .sort_index() + .sort_index(axis=1) + ) df Basic MultiIndex slicing using slices, lists, and labels. @@ -763,13 +782,16 @@ Enhancements .. 
ipython:: python - pd.Series({('a', 'b'): 1, ('a', 'a'): 0, - ('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4}) - pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, - ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, - ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, - ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, - ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + pd.Series({('a', 'b'): 1, ('a', 'a'): 0, ('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4}) + pd.DataFrame( + { + ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, + ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, + ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, + ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, + ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}, + } + ) - Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) - ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`) @@ -782,32 +804,40 @@ Enhancements .. ipython:: python - household = pd.DataFrame({'household_id': [1, 2, 3], - 'male': [0, 1, 0], - 'wealth': [196087.3, 316478.7, 294750] - }, - columns=['household_id', 'male', 'wealth'] - ).set_index('household_id') + household = pd.DataFrame( + { + 'household_id': [1, 2, 3], + 'male': [0, 1, 0], + 'wealth': [196087.3, 316478.7, 294750], + }, + columns=['household_id', 'male', 'wealth'], + ).set_index('household_id') household - portfolio = pd.DataFrame({'household_id': [1, 2, 2, 3, 3, 3, 4], - 'asset_id': ["nl0000301109", - "nl0000289783", - "gb00b03mlx29", - "gb00b03mlx29", - "lu0197800237", - "nl0000289965", - np.nan], - 'name': ["ABN Amro", - "Robeco", - "Royal Dutch Shell", - "Royal Dutch Shell", - "AAB Eastern Europe Equity Fund", - "Postbank BioTech Fonds", - np.nan], - 'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0] - }, - columns=['household_id', 'asset_id', 'name', 'share'] - ).set_index(['household_id', 'asset_id']) + portfolio = pd.DataFrame( + { + 'household_id': [1, 2, 2, 3, 3, 3, 4], + 'asset_id': [ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + 'name': [ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + np.nan, + ], + 'share': [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + }, + columns=['household_id', 'asset_id', 'name', 'share'], + ).set_index(['household_id', 'asset_id']) portfolio household.join(portfolio, how='inner') @@ -842,22 +872,30 @@ Enhancements .. 
ipython:: python import datetime - df = pd.DataFrame({ - 'Branch': 'A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1], - 'Date': [datetime.datetime(2013, 11, 1, 13, 0), - datetime.datetime(2013, 9, 1, 13, 5), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 11, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0)], - 'PayDay': [datetime.datetime(2013, 10, 4, 0, 0), - datetime.datetime(2013, 10, 15, 13, 5), - datetime.datetime(2013, 9, 5, 20, 0), - datetime.datetime(2013, 11, 2, 10, 0), - datetime.datetime(2013, 10, 7, 20, 0), - datetime.datetime(2013, 9, 5, 10, 0)]}) + + df = pd.DataFrame( + { + 'Branch': 'A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1], + 'Date': [ + datetime.datetime(2013, 11, 1, 13, 0), + datetime.datetime(2013, 9, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 11, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + ], + 'PayDay': [ + datetime.datetime(2013, 10, 4, 0, 0), + datetime.datetime(2013, 10, 15, 13, 5), + datetime.datetime(2013, 9, 5, 20, 0), + datetime.datetime(2013, 11, 2, 10, 0), + datetime.datetime(2013, 10, 7, 20, 0), + datetime.datetime(2013, 9, 5, 10, 0), + ], + } + ) df .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index a8f8955c3c1b9..78fd182ea86c3 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -124,9 +124,7 @@ Enhancements .. ipython:: python - rng = pd.date_range( - "3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London" - ) + rng = pd.date_range("3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London") rng.tz See :ref:`the docs `. diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 8dafed1efee97..95359adffc479 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -71,8 +71,9 @@ For full docs, see the :ref:`categorical introduction ` and the .. ipython:: python - df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) + df = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']} + ) df["grade"] = df["raw_grade"].astype("category") df["grade"] @@ -81,8 +82,9 @@ For full docs, see the :ref:`categorical introduction ` and the df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"]) # Reorder the categories and simultaneously add the missing categories - df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", - "medium", "good", "very good"]) + df["grade"] = df["grade"].cat.set_categories( + ["very bad", "bad", "medium", "good", "very good"] + ) df["grade"] df.sort_values("grade") df.groupby("grade", observed=False).size() @@ -176,9 +178,14 @@ Construct a ``TimedeltaIndex`` .. ipython:: python - pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', - np.timedelta64(2, 'D'), - datetime.timedelta(days=2, seconds=2)]) + pd.TimedeltaIndex( + [ + '1 days', + '1 days, 00:00:05', + np.timedelta64(2, 'D'), + datetime.timedelta(days=2, seconds=2), + ] + ) Constructing a ``TimedeltaIndex`` with a regular range @@ -213,8 +220,7 @@ You can now use a ``TimedeltaIndex`` as the index of a pandas object .. 
ipython:: python - s = pd.Series(np.arange(5), - index=pd.timedelta_range('1 days', periods=5, freq='s')) + s = pd.Series(np.arange(5), index=pd.timedelta_range('1 days', periods=5, freq='s')) s You can select with partial string selections @@ -250,8 +256,15 @@ A new display option ``display.memory_usage`` (see :ref:`options`) sets the defa .. ipython:: python - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + 'int64', + 'float64', + 'datetime64[ns]', + 'timedelta64[ns]', + 'complex128', + 'object', + 'bool', + ] n = 5000 data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes} df = pd.DataFrame(data) @@ -500,7 +513,7 @@ Rolling/expanding moments improvements .. ipython:: python - s = pd.Series([1, None, None, None, 2, 3]) + s = pd.Series([1, None, None, None, 2, 3]) .. code-block:: ipython @@ -595,7 +608,7 @@ Rolling/expanding moments improvements .. ipython:: python - s = pd.Series([1., 2., 0., 4.]) + s = pd.Series([1.0, 2.0, 0.0, 4.0]) .. code-block:: ipython @@ -683,7 +696,7 @@ for more details): .. code-block:: python - pd.Categorical([0,1,0,2,1], levels=['a', 'b', 'c']) + pd.Categorical([0, 1, 0, 2, 1], levels=['a', 'b', 'c']) will have to adapted to the following to keep the same behaviour: @@ -772,11 +785,12 @@ Other notable API changes: .. ipython:: python :okexcept: - s = pd.Series(np.arange(3, dtype='int64'), - index=pd.MultiIndex.from_product([['A'], - ['foo', 'bar', 'baz']], - names=['one', 'two']) - ).sort_index() + s = pd.Series( + np.arange(3, dtype='int64'), + index=pd.MultiIndex.from_product( + [['A'], ['foo', 'bar', 'baz']], names=['one', 'two'] + ), + ).sort_index() s try: s.loc[['D']] @@ -791,7 +805,7 @@ Other notable API changes: .. ipython:: python - s = pd.Series([1., 2., 3.]) + s = pd.Series([1.0, 2.0, 3.0]) s.loc[0] = None s @@ -903,8 +917,7 @@ Other notable API changes: .. ipython:: python - df = pd.DataFrame([[True, 1], [False, 2]], - columns=["female", "fitness"]) + df = pd.DataFrame([[True, 1], [False, 2]], columns=["female", "fitness"]) df df.dtypes @@ -1027,10 +1040,14 @@ Other: .. ipython:: python - df = pd.DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, - 'catB': ['a', 'b', 'c', 'd'] * 6, - 'numC': np.arange(24), - 'numD': np.arange(24.) + .5}) + df = pd.DataFrame( + { + 'catA': ['foo', 'foo', 'bar'] * 8, + 'catB': ['a', 'b', 'c', 'd'] * 6, + 'numC': np.arange(24), + 'numD': np.arange(24.0) + 0.5, + } + ) df.describe(include=["object"]) df.describe(include=["number", "object"], exclude=["float"]) @@ -1050,8 +1067,7 @@ Other: .. ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], - 'C': [1, 2, 3]}) + df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], 'C': [1, 2, 3]}) pd.get_dummies(df) - ``PeriodIndex`` supports ``resolution`` as the same as ``DatetimeIndex`` (:issue:`7708`) @@ -1118,8 +1134,9 @@ Other: .. ipython:: python - idx = pd.MultiIndex.from_product([['a'], range(3), list("pqr")], - names=['foo', 'bar', 'baz']) + idx = pd.MultiIndex.from_product( + [['a'], range(3), list("pqr")], names=['foo', 'bar', 'baz'] + ) idx.set_names('qux', level=0) idx.set_names(['qux', 'corge'], level=[0, 1]) idx.set_levels(['a', 'b', 'c'], level='bar') diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index acc5409b86d09..251f620537f8f 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -186,6 +186,7 @@ Other enhancements: .. 
code-block:: python from sqlalchemy.types import String + data.to_sql('data_dtype', engine, dtype={'Col_1': String}) # noqa F821 - ``Series.all`` and ``Series.any`` now support the ``level`` and ``skipna`` parameters (:issue:`8302`): diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index d53ea095bb96c..3b8d84af91158 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -63,8 +63,7 @@ a function to be evaluated. .. ipython:: python - iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] - / x['SepalLength'])).head() + iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head() The power of ``assign`` comes when used in chains of operations. For example, we can limit the DataFrame to just those with a Sepal Length greater than 5, @@ -73,10 +72,14 @@ calculate the ratio, and plot .. ipython:: python iris = pd.read_csv('data/iris.data') - (iris.query('SepalLength > 5') - .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength, - PetalRatio=lambda x: x.PetalWidth / x.PetalLength) - .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + ( + iris.query('SepalLength > 5') + .assign( + SepalRatio=lambda x: x.SepalWidth / x.SepalLength, + PetalRatio=lambda x: x.PetalWidth / x.PetalLength, + ) + .plot(kind='scatter', x='SepalRatio', y='PetalRatio') + ) .. image:: ../_static/whatsnew_assign.png :scale: 50 % @@ -94,13 +97,17 @@ Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:is .. code-block:: python s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) + s.index = pd.MultiIndex.from_tuples( + [ + (1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1), + ], + names=['A', 'B', 'C', 'D'], + ) s @@ -108,9 +115,9 @@ Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:is ss = s.to_sparse() ss - A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=False) + A, rows, columns = ss.to_coo( + row_levels=['A', 'B'], column_levels=['C', 'D'], sort_labels=False + ) A A.todense() @@ -123,8 +130,8 @@ from a ``scipy.sparse.coo_matrix``: .. code-block:: python from scipy import sparse - A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) + + A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)) A A.todense() @@ -268,9 +275,11 @@ The behavior of a small sub-set of edge cases for using ``.loc`` have changed (: .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 4), - columns=list('ABCD'), - index=pd.date_range('20130101', periods=5)) + df = pd.DataFrame( + np.random.randn(5, 4), + columns=list('ABCD'), + index=pd.date_range('20130101', periods=5), + ) df s = pd.Series(range(5), [-2, -1, 1, 2, 3]) s @@ -318,7 +327,7 @@ The behavior of a small sub-set of edge cases for using ``.loc`` have changed (: .. code-block:: python - In [4]: df.loc[2:3] + In[4]: df.loc[2:3] KeyError: 'start bound [2] is not the [index]' New behavior @@ -676,8 +685,7 @@ Bug fixes .. 
ipython:: python - df1 = pd.DataFrame({'x': pd.Series(['a', 'b', 'c']), - 'y': pd.Series(['d', 'e', 'f'])}) + df1 = pd.DataFrame({'x': pd.Series(['a', 'b', 'c']), 'y': pd.Series(['d', 'e', 'f'])}) df2 = df1[['x']] df2['y'] = ['g', 'h', 'i'] diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index fb71ec60a22f0..4a8e0b7eb462e 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -783,9 +783,7 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame( - {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} - ) + df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) df_with_missing diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 569197fe9daf5..8f9d9d2d838e4 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -117,8 +117,7 @@ And multiple aggregations .. ipython:: python - r.agg({'A': ['mean', 'std'], - 'B': ['mean', 'std']}) + r.agg({'A': ['mean', 'std'], 'B': ['mean', 'std']}) .. _whatsnew_0180.enhancements.rename: @@ -136,8 +135,7 @@ argument for altering the Series or axis *name*, in addition to their old behavi .. ipython:: python df = pd.DataFrame(np.random.randn(5, 2)) - (df.rename_axis("indexname") - .rename_axis("columns_name", axis="columns")) + (df.rename_axis("indexname").rename_axis("columns_name", axis="columns")) The new functionality works well in method chains. Previously these methods only accepted functions or dicts mapping a *label* to a new label. This continues to work as before for function or dict-like values. @@ -373,7 +371,7 @@ New behavior: .. ipython:: python - s = pd.Series([1, 2, 3], index=np.arange(3.)) + s = pd.Series([1, 2, 3], index=np.arange(3.0)) s s.index print(s.to_csv(path_or_buf=None, header=False)) @@ -410,8 +408,7 @@ New behavior: .. ipython:: python - df = pd.DataFrame({'a': [0, 1, 1], - 'b': pd.Series([100, 200, 300], dtype='uint32')}) + df = pd.DataFrame({'a': [0, 1, 1], 'b': pd.Series([100, 200, 300], dtype='uint32')}) df.dtypes ix = df['a'] == 1 df.loc[ix, 'b'] = df.loc[ix, 'b'] @@ -447,11 +444,13 @@ New behavior: .. ipython:: python - df = pd.DataFrame(np.array(range(1,10)).reshape(3,3), - columns=list('abc'), - index=[[4,4,8], [8,10,12]]) + df = pd.DataFrame( + np.array(range(1, 10)).reshape(3, 3), + columns=list('abc'), + index=[[4, 4, 8], [8, 10, 12]], + ) df - df.loc[4, 'c'] = np.array([0., 1.]) + df.loc[4, 'c'] = np.array([0.0, 1.0]) df .. _whatsnew_0180.enhancements.xarray: @@ -669,10 +668,22 @@ New signature .. ipython:: python - pd.Series([0,1]).rank(axis=0, method='average', numeric_only=False, - na_option='keep', ascending=True, pct=False) - pd.DataFrame([0,1]).rank(axis=0, method='average', numeric_only=False, - na_option='keep', ascending=True, pct=False) + pd.Series([0, 1]).rank( + axis=0, + method='average', + numeric_only=False, + na_option='keep', + ascending=True, + pct=False, + ) + pd.DataFrame([0, 1]).rank( + axis=0, + method='average', + numeric_only=False, + na_option='keep', + ascending=True, + pct=False, + ) Bug in QuarterBegin with n=0 @@ -720,10 +731,11 @@ Like the change in the window functions API :ref:`above ` .. 
ipython:: python - df = pd.DataFrame({'A': np.random.randn(1000), - 'B': 'foo', - 'C': pd.date_range('20130101', periods=1000, freq='s')}) + df = pd.DataFrame( + { + 'A': np.random.randn(1000), + 'B': 'foo', + 'C': pd.date_range('20130101', periods=1000, freq='s'), + } + ) Using an explicit compression type @@ -238,6 +250,7 @@ The default is to infer the compression type from the extension (``compression=' :suppress: import os + os.remove("data.pkl.compress") os.remove("data.pkl.gz") os.remove("s1.pkl.bz2") @@ -276,13 +289,16 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. ipython:: python chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']] - df = pd.DataFrame({ - 'A': np.random.randint(100), - 'B': np.random.randint(100), - 'C': np.random.randint(100), - 'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100), - categories=chromosomes, - ordered=True)}) + df = pd.DataFrame( + { + 'A': np.random.randint(100), + 'B': np.random.randint(100), + 'C': np.random.randint(100), + 'chromosomes': pd.Categorical( + np.random.choice(chromosomes, 100), categories=chromosomes, ordered=True + ), + } + ) df **Previous behavior**: @@ -311,10 +327,13 @@ the data. .. ipython:: python df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) + { + 'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3), + }, + index=pd.Index(range(3), name='idx'), + ) df df.to_json(orient='table') @@ -346,8 +365,9 @@ All sparse formats are supported, but matrices that are not in :mod:`COOrdinate .. code-block:: python from scipy.sparse import csr_matrix + arr = np.random.random(size=(1000, 5)) - arr[arr < .9] = 0 + arr[arr < 0.9] = 0 sp_arr = csr_matrix(arr) sp_arr sdf = pd.SparseDataFrame(sp_arr) @@ -372,14 +392,15 @@ For example, after running the following, ``styled.xlsx`` renders as below: np.random.seed(24) df = pd.DataFrame({'A': np.linspace(1, 10, 10)}) - df = pd.concat([df, pd.DataFrame(np.random.RandomState(24).randn(10, 4), - columns=list('BCDE'))], - axis=1) + df = pd.concat( + [df, pd.DataFrame(np.random.RandomState(24).randn(10, 4), columns=list('BCDE'))], + axis=1, + ) df.iloc[0, 2] = np.nan df - styled = (df.style - .map(lambda val: 'color:red;' if val < 0 else 'color:black;') - .highlight_max()) + styled = df.style.map( + lambda val: 'color:red;' if val < 0 else 'color:black;' + ).highlight_max() styled.to_excel('styled.xlsx', engine='openpyxl') .. image:: ../_static/style-excel.png @@ -388,6 +409,7 @@ For example, after running the following, ``styled.xlsx`` renders as below: :suppress: import os + os.remove('styled.xlsx') See the :ref:`Style documentation ` for more detail. @@ -441,9 +463,9 @@ An ``IntervalIndex`` can also be used in ``Series`` and ``DataFrame`` as the ind .. ipython:: python - df = pd.DataFrame({'A': range(4), - 'B': pd.cut([0, 3, 1, 1], bins=c.categories) - }).set_index('B') + df = pd.DataFrame( + {'A': range(4), 'B': pd.cut([0, 3, 1, 1], bins=c.categories)} + ).set_index('B') df Selecting via a specific interval: @@ -719,16 +741,36 @@ data-types would yield different return types. These are now made consistent. (: .. 
ipython:: python # Series, returns an array of Timestamp tz-aware - pd.Series([pd.Timestamp(r'20160101', tz=r'US/Eastern'), - pd.Timestamp(r'20160101', tz=r'US/Eastern')]).unique() - pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) + pd.Series( + [ + pd.Timestamp(r'20160101', tz=r'US/Eastern'), + pd.Timestamp(r'20160101', tz=r'US/Eastern'), + ] + ).unique() + pd.unique( + pd.Series( + [ + pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern'), + ] + ) + ) # Index, returns a DatetimeIndex - pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]).unique() - pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) + pd.Index( + [ + pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern'), + ] + ).unique() + pd.unique( + pd.Index( + [ + pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern'), + ] + ) + ) - Categoricals @@ -770,9 +812,12 @@ Partial string indexing changes .. ipython:: python - df = pd.DataFrame({'a': [1, 2, 3]}, pd.DatetimeIndex(['2011-12-31 23:59:59', - '2012-01-01 00:00:00', - '2012-01-01 00:00:01'])) + df = pd.DataFrame( + {'a': [1, 2, 3]}, + pd.DatetimeIndex( + ['2011-12-31 23:59:59', '2012-01-01 00:00:00', '2012-01-01 00:00:01'] + ), + ) Previous behavior: .. code-block:: ipython @@ -905,10 +950,10 @@ This is *unchanged* from prior versions, but shown for illustration purposes: .. code-block:: python - In [87]: df.index.is_lexsorted() + In[87]: df.index.is_lexsorted() Out[87]: False - In [88]: df.index.is_monotonic + In[88]: df.index.is_monotonic Out[88]: False Sorting works as expected @@ -919,10 +964,10 @@ Sorting works as expected .. code-block:: python - In [90]: df.sort_index().index.is_lexsorted() + In[90]: df.sort_index().index.is_lexsorted() Out[90]: True - In [91]: df.sort_index().index.is_monotonic + In[91]: df.sort_index().index.is_monotonic Out[91]: True However, this example, which has a non-monotonic 2nd level, @@ -930,9 +975,10 @@ doesn't behave as desired. .. ipython:: python - df = pd.DataFrame({'value': [1, 2, 3, 4]}, - index=pd.MultiIndex([['a', 'b'], ['bb', 'aa']], - [[0, 0, 1, 1], [0, 1, 0, 1]])) + df = pd.DataFrame( + {'value': [1, 2, 3, 4]}, + index=pd.MultiIndex([['a', 'b'], ['bb', 'aa']], [[0, 0, 1, 1], [0, 1, 0, 1]]), + ) df Previous behavior: @@ -1041,10 +1087,11 @@ See the section on :ref:`Windowed Binary Operations ` for more .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.rand(100, 2), - columns=pd.Index(['A', 'B'], name='bar'), - index=pd.date_range('20160101', - periods=100, freq='D', name='foo')) + df = pd.DataFrame( + np.random.rand(100, 2), + columns=pd.Index(['A', 'B'], name='bar'), + index=pd.date_range('20160101', periods=100, freq='D', name='foo'), + ) df.tail() Previous behavior: @@ -1111,6 +1158,7 @@ New behavior: :suppress: import os + os.remove('store.h5') .. _whatsnew_0200.api_breaking.index_order: @@ -1179,9 +1227,7 @@ is fixed that allowed this to return a ``Series`` under certain circumstance. (: .. ipython:: python - df = pd.DataFrame({'col1': [3, 4, 5], - 'col2': ['C', 'D', 'E'], - 'col3': [1, 3, 9]}) + df = pd.DataFrame({'col1': [3, 4, 5], 'col2': ['C', 'D', 'E'], 'col3': [1, 3, 9]}) df Previous behavior: @@ -1304,14 +1350,16 @@ The following are now part of this API: .. 
code-block:: python - ['DtypeWarning', - 'EmptyDataError', - 'OutOfBoundsDatetime', - 'ParserError', - 'ParserWarning', - 'PerformanceWarning', - 'UnsortedIndexError', - 'UnsupportedFunctionCall'] + [ + 'DtypeWarning', + 'EmptyDataError', + 'OutOfBoundsDatetime', + 'ParserError', + 'ParserWarning', + 'PerformanceWarning', + 'UnsortedIndexError', + 'UnsupportedFunctionCall', + ] .. _whatsnew_0200.privacy.testing: @@ -1368,9 +1416,7 @@ Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some example .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [4, 5, 6]}, - index=list('abc')) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc')) df @@ -1487,9 +1533,7 @@ This is an illustrative example: .. ipython:: python - df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - 'B': range(5), - 'C': range(5)}) + df = pd.DataFrame({'A': [1, 1, 1, 2, 2], 'B': range(5), 'C': range(5)}) df Here is a typical useful syntax for computing different aggregations for different columns. This @@ -1544,10 +1588,7 @@ You can accomplish nearly the same by: .. ipython:: python - (df.groupby('A') - .agg({'B': 'sum', 'C': 'min'}) - .rename(columns={'B': 'foo', 'C': 'bar'}) - ) + (df.groupby('A').agg({'B': 'sum', 'C': 'min'}).rename(columns={'B': 'foo', 'C': 'bar'})) diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index dad69b99ee6a4..f3f04e09a2e9b 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -8,7 +8,7 @@ Version 0.21.0 (October 27, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a major release from 0.20.3 and includes a number of API changes, deprecations, new features, @@ -69,9 +69,9 @@ to native types, but not any coercive conversions. For example: .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': np.array([1, 2, 3], dtype='object'), - 'C': ['1', '2', '3']}) + df = pd.DataFrame( + {'A': [1, 2, 3], 'B': np.array([1, 2, 3], dtype='object'), 'C': ['1', '2', '3']} + ) df.dtypes df.infer_objects().dtypes @@ -126,8 +126,7 @@ For example: .. ipython:: python - df = pd.DataFrame(np.arange(8).reshape(2, 4), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=['A', 'B', 'C', 'D']) df df.drop(['B', 'C'], axis=1) # the following is now equivalent @@ -242,23 +241,28 @@ First we set the data: .. ipython:: python import numpy as np + n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', - 'Product_2', - 'Product_3' - ], n), - 'Revenue': (np.random.random(n) * 50 + 10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) + df = pd.DataFrame( + { + 'Store': np.random.choice(['Store_1', 'Store_2'], n), + 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Revenue': (np.random.random(n) * 50 + 10).round(2), + 'Quantity': np.random.randint(1, 10, size=n), + } + ) df.head(2) Now, to find prices per store/product, we can simply do: .. ipython:: python - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) - .unstack().round(2)) + ( + df.groupby(['Store', 'Product']) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) + .unstack() + .round(2) + ) See the :ref:`documentation ` for more. 
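(An illustrative aside, not part of the patch: the v0.21.0 hunk above only re-wraps the groupby ``pipe`` example, so the two layouts are the same expression. A minimal sketch, assuming pandas and numpy are installed and reusing the column names from that hunk, that checks the old and new spellings agree:)

.. code-block:: python

    # Not part of the patch: a quick check that the re-wrapped groupby/pipe
    # example from v0.21.0.rst above is behavior-preserving. Column names are
    # taken from that hunk; the random data is shared by both spellings.
    import numpy as np
    import pandas as pd

    n = 1000
    df = pd.DataFrame(
        {
            "Store": np.random.choice(["Store_1", "Store_2"], n),
            "Product": np.random.choice(["Product_1", "Product_2", "Product_3"], n),
            "Revenue": (np.random.random(n) * 50 + 10).round(2),
            "Quantity": np.random.randint(1, 10, size=n),
        }
    )

    # old layout (pre-black)
    before = (df.groupby(['Store', 'Product'])
              .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum())
              .unstack().round(2))

    # new layout (post-black)
    after = (
        df.groupby(["Store", "Product"])
        .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum())
        .unstack()
        .round(2)
    )

    # identical expressions, so the results compare equal
    pd.testing.assert_frame_equal(before, after)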
diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 808741ccf4475..706caca5166d8 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -8,7 +8,7 @@ What's new in 0.23.0 (May 15, 2018) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a major release from 0.22.0 and includes a number of API changes, @@ -52,11 +52,15 @@ A ``DataFrame`` can now be written to and subsequently read back via JSON while .. ipython:: python - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, - index=pd.Index(range(4), name='idx')) + df = pd.DataFrame( + { + 'foo': [1, 2, 3, 4], + 'bar': ['a', 'b', 'c', 'd'], + 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + 'qux': pd.Categorical(['a', 'b', 'c', 'c']), + }, + index=pd.Index(range(4), name='idx'), + ) df df.dtypes df.to_json('test.json', orient='table') @@ -80,6 +84,7 @@ Please note that the string ``index`` is not supported with the round trip forma :suppress: import os + os.remove('test.json') @@ -142,17 +147,25 @@ levels ` documentation section. left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key2': ['K0', 'K1', 'K0', 'K1']}, - index=left_index) + left = pd.DataFrame( + { + 'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1'], + }, + index=left_index, + ) right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3'], - 'key2': ['K0', 'K0', 'K0', 'K1']}, - index=right_index) + right = pd.DataFrame( + { + 'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1'], + }, + index=right_index, + ) left.merge(right, on=['key1', 'key2']) @@ -171,13 +184,13 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values .. ipython:: python # Build MultiIndex - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) + idx = pd.MultiIndex.from_tuples( + [('a', 1), ('a', 2), ('a', 2), ('b', 2), ('b', 1), ('b', 1)] + ) idx.names = ['first', 'second'] # Build DataFrame - df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, - index=idx) + df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, index=idx) df_multi # Sort by 'second' (index) and 'A' (column) @@ -254,10 +267,8 @@ number of groups. We have added a keyword ``observed`` to control this behavior, .. ipython:: python - cat1 = pd.Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = pd.Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df['C'] = ['foo', 'bar'] * 2 df @@ -279,10 +290,8 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna`` .. 
ipython:: python - cat1 = pd.Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = pd.Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df @@ -356,8 +365,7 @@ outside the existing valid values while preserving those inside. (:issue:`16284 .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) ser Fill one consecutive inside value in both directions @@ -513,6 +521,7 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist .. ipython:: python from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) cdt = CategoricalDtype(categories=list('abcd'), ordered=True) df = df.astype(cdt) @@ -638,10 +647,7 @@ New behavior (for Python >= 3.6): .. ipython:: python - pd.Series({'Income': 2000, - 'Expenses': -1500, - 'Taxes': -200, - 'Net result': 300}) + pd.Series({'Income': 2000, 'Expenses': -1500, 'Taxes': -200, 'Net result': 300}) Notice that the Series is now ordered by insertion order. This new behavior is used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries`` @@ -652,10 +658,9 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use .. ipython:: python - pd.Series({'Income': 2000, - 'Expenses': -1500, - 'Taxes': -200, - 'Net result': 300}).sort_index() + pd.Series( + {'Income': 2000, 'Expenses': -1500, 'Taxes': -200, 'Net result': 300} + ).sort_index() .. _whatsnew_0230.api_breaking.deprecate_panel: @@ -754,8 +759,7 @@ where a list-like (e.g. ``tuple`` or ``list`` is returned) (:issue:`16353`, :iss .. ipython:: python - df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, - columns=['A', 'B', 'C']) + df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) df Previous behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index cb12962256a55..ae4be9ebd20db 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -252,19 +252,21 @@ See the :ref:`Merge, join, and concatenate .. 
ipython:: python - index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) + index_left = pd.MultiIndex.from_tuples( + [('K0', 'X0'), ('K0', 'X1'), ('K1', 'X2')], names=['key', 'X'] + ) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, index=index_left) + left = pd.DataFrame( + {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=index_left + ) - index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) + index_right = pd.MultiIndex.from_tuples( + [('K0', 'Y0'), ('K1', 'Y1'), ('K2', 'Y2'), ('K2', 'Y3')], names=['key', 'Y'] + ) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, index=index_right) + right = pd.DataFrame( + {'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}, index=index_right + ) left.join(right) @@ -272,8 +274,9 @@ For earlier versions this can be done using the following. .. ipython:: python - pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key', 'X', 'Y']) + pd.merge(left.reset_index(), right.reset_index(), on=['key'], how='inner').set_index( + ['key', 'X', 'Y'] + ) .. _whatsnew_0240.enhancements.read_html: @@ -287,7 +290,10 @@ value. (:issue:`17054`) .. ipython:: python from io import StringIO - result = pd.read_html(StringIO(""" + + result = pd.read_html( + StringIO( + """ @@ -299,7 +305,9 @@ value. (:issue:`17054`) -
      <table>
        <tr>
          <th>A</th><th>B</th><th>C</th>
        </tr>
        <tr>
          <td colspan="2">1</td><td>2</td>
        </tr>
""")) + """ + ) + ) *Previous behavior*: @@ -328,9 +336,12 @@ convenient way to apply users' predefined styling functions, and can help reduce df = pd.DataFrame({'N': [1250, 1500, 1750], 'X': [0.25, 0.35, 0.50]}) + def format_and_align(styler): - return (styler.format({'N': '{:,}', 'X': '{:.1%}'}) - .set_properties(**{'text-align': 'right'})) + return styler.format({'N': '{:,}', 'X': '{:.1%}'}).set_properties( + **{'text-align': 'right'} + ) + df.style.pipe(format_and_align).set_caption('Summary of results.') @@ -352,8 +363,9 @@ Example: .. ipython:: python - mi = pd.MultiIndex.from_product([list('AB'), list('CD'), list('EF')], - names=['AB', 'CD', 'EF']) + mi = pd.MultiIndex.from_product( + [list('AB'), list('CD'), list('EF')], names=['AB', 'CD', 'EF'] + ) df = pd.DataFrame(list(range(len(mi))), index=mi, columns=['N']) df df.rename_axis(index={'CD': 'New'}) @@ -652,8 +664,7 @@ that the dates have been converted to UTC .. ipython:: python - pd.to_datetime(["2015-11-18 15:30:00+05:30", - "2015-11-18 16:30:00+06:30"], utc=True) + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) .. _whatsnew_0240.api_breaking.read_csv_mixed_tz: @@ -763,8 +774,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays .. ipython:: python - ser = pd.Series([pd.Timestamp('2000', tz='UTC'), - pd.Timestamp('2000', tz='UTC')]) + ser = pd.Series([pd.Timestamp('2000', tz='UTC'), pd.Timestamp('2000', tz='UTC')]) *Previous behavior*: @@ -1122,8 +1132,8 @@ broadcast. (:issue:`23000`) .. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows .. _whatsnew_0240.api.incompatibilities: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5cf5623f73036..8e5e87c44ce65 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -41,9 +41,13 @@ output columns when applying multiple aggregation functions to specific columns .. ipython:: python - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals = pd.DataFrame( + { + 'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0], + } + ) animals animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), @@ -93,14 +97,11 @@ You can now provide multiple lambda functions to a list-like aggregation in .. ipython:: python - animals.groupby('kind').height.agg([ - lambda x: x.iloc[0], lambda x: x.iloc[-1] - ]) + animals.groupby('kind').height.agg([lambda x: x.iloc[0], lambda x: x.iloc[-1]]) - animals.groupby('kind').agg([ - lambda x: x.iloc[0] - x.iloc[1], - lambda x: x.iloc[0] + x.iloc[1] - ]) + animals.groupby('kind').agg( + [lambda x: x.iloc[0] - x.iloc[1], lambda x: x.iloc[0] + x.iloc[1]] + ) Previously, these raised a ``SpecificationError``. @@ -194,8 +195,7 @@ Here is a typical usecase. You have comma separated string in a column. .. ipython:: python - df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, - {'var1': 'd,e,f', 'var2': 2}]) + df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, {'var1': 'd,e,f', 'var2': 2}]) df Creating a long form ``DataFrame`` is now straightforward using chained operations @@ -301,8 +301,7 @@ would be reassigned as -1. (:issue:`19387`) .. 
ipython:: python :okexcept: - pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) + pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]]) pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) @@ -323,6 +322,7 @@ Now every group is evaluated only a single time. df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) df + def func(group): print(group.name) return group @@ -442,7 +442,7 @@ Previously, columns that were categorical, but not the groupby key(s) would be c .. code-block:: python - In [5]: df.groupby('payload').first().col.dtype + In[5]: df.groupby('payload').first().col.dtype Out[5]: dtype('O') *New Behavior*: @@ -583,10 +583,10 @@ this would previously return ``True`` for any ``Interval`` overlapping an ``Inte .. code-block:: python - In [4]: pd.Interval(1, 2, closed='neither') in ii + In[4]: pd.Interval(1, 2, closed='neither') in ii Out[4]: True - In [5]: pd.Interval(-10, 10, closed='both') in ii + In[5]: pd.Interval(-10, 10, closed='both') in ii Out[5]: True *New behavior*: @@ -603,10 +603,10 @@ returning locations for overlapping matches. A ``KeyError`` will be raised if a .. code-block:: python - In [6]: ii.get_loc(pd.Interval(1, 5)) + In[6]: ii.get_loc(pd.Interval(1, 5)) Out[6]: array([0, 1]) - In [7]: ii.get_loc(pd.Interval(2, 6)) + In[7]: ii.get_loc(pd.Interval(2, 6)) Out[7]: array([0, 1, 2]) *New behavior*: @@ -784,7 +784,7 @@ This change applies only when pandas is running on Python>=3.6 (:issue:`27309`). data = [ {'name': 'Joe', 'state': 'NY', 'age': 18}, {'name': 'Jane', 'state': 'KY', 'age': 19, 'hobby': 'Minecraft'}, - {'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'} + {'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'}, ] *Previous Behavior*: diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index ce02525a69ace..85c3fd61d7759 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -178,9 +178,7 @@ types. For example, ``'kde'`` is a new option: .. code-block:: python - s = pd.Series( - np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3)) - ) + s = pd.Series(np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))) plt.figure() s.hist(density=True, alpha=0.2) s.plot(kind="kde") diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index cdc0671feeeb2..8f8d2c437e1fc 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -152,9 +152,7 @@ API changes import io - data = ('A,B,C\n' - '00001,001,5\n' - '00002,002,6') + data = 'A,B,C\n' '00001,001,5\n' '00002,002,6' pd.read_csv(io.StringIO(data), converters={'A': lambda x: x.strip()}) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 94a8ee7cd1a5d..b25645c76ae97 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -208,9 +208,9 @@ Example: .. ipython:: python - df = pd.DataFrame({'x': ['abc', None, 'def'], - 'y': [1, 2, np.nan], - 'z': [True, False, True]}) + df = pd.DataFrame( + {'x': ['abc', None, 'def'], 'y': [1, 2, np.nan], 'z': [True, False, True]} + ) df df.dtypes @@ -427,9 +427,9 @@ Extended verbose info output for :class:`~pandas.DataFrame` .. 
ipython:: python - df = pd.DataFrame({"int_col": [1, 2, 3], - "text_col": ["a", "b", "c"], - "float_col": [0.0, 0.1, 0.2]}) + df = pd.DataFrame( + {"int_col": [1, 2, 3], "text_col": ["a", "b", "c"], "float_col": [0.0, 0.1, 0.2]} + ) df.info(verbose=True) :meth:`pandas.array` inference changes @@ -645,8 +645,9 @@ scalar values in the result are instances of the extension dtype's scalar type. .. ipython:: python - df = pd.DataFrame({"A": ['a', 'b']}, dtype='category', - index=pd.date_range('2000', periods=2)) + df = pd.DataFrame( + {"A": ['a', 'b']}, dtype='category', index=pd.date_range('2000', periods=2) + ) df diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37d021efddf0b..91b73e9efc5d5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -77,7 +77,7 @@ We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing t { "col1": ["a", "a", "b", "b", "a"], "col2": [1.0, 2.0, 3.0, np.nan, 5.0], - "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + "col3": [1.0, 2.0, 3.0, 4.0, 5.0], }, columns=["col1", "col2", "col3"], ) @@ -160,8 +160,7 @@ When applied to a ``DataFrame``, they key is applied per-column to all columns o .. ipython:: python - df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], - 'b': [1, 2, 3, 4, 5, 6]}) + df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], 'b': [1, 2, 3, 4, 5, 6]}) df .. ipython:: python @@ -188,8 +187,9 @@ For example: .. ipython:: python - ts = pd.Timestamp(year=2019, month=10, day=27, hour=1, minute=30, - tz="dateutil/Europe/London", fold=1) + ts = pd.Timestamp( + year=2019, month=10, day=27, hour=1, minute=30, tz="dateutil/Europe/London", fold=1 + ) ts For more on working with fold, see :ref:`Fold subsection ` in the user guide. @@ -205,8 +205,12 @@ For example: .. ipython:: python - tz_strs = ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100", - "2010-01-01 12:00:00 +0300", "2010-01-01 12:00:00 +0400"] + tz_strs = [ + "2010-01-01 12:00:00 +0100", + "2010-01-01 12:00:00 -0100", + "2010-01-01 12:00:00 +0300", + "2010-01-01 12:00:00 +0400", + ] pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) .. code-block:: ipython @@ -367,11 +371,13 @@ As an example of this, given: .. ipython:: python - df = pd.DataFrame({ - 'a': [0, 0, 0, 0], - 'b': [0, 2, 3, 4], - 'c': ['A', 'B', 'C', 'D'], - }).set_index(['a', 'b']) + df = pd.DataFrame( + { + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'], + } + ).set_index(['a', 'b']) mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here: @@ -527,10 +533,8 @@ those integer keys is not present in the first level of the index (:issue:`33539 .. ipython:: python - left_df = pd.DataFrame({'animal': ['dog', 'pig'], - 'max_speed': [40, 11]}) - right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], - 'max_speed': [80, 11]}) + left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) left_df right_df @@ -659,8 +663,9 @@ the previous index (:issue:`32240`). .. 
ipython:: python - df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], - "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df = pd.DataFrame( + {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} + ) df *Previous behavior*: @@ -694,6 +699,7 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) + def func(row): print(row) return row @@ -1054,8 +1060,7 @@ MultiIndex .. ipython:: python - df = pd.DataFrame(np.arange(4), - index=[["a", "a", "b", "b"], [1, 2, 1, 2]]) + df = pd.DataFrame(np.arange(4), index=[["a", "a", "b", "b"], [1, 2, 1, 2]]) # Rows are now ordered as the requested keys df.loc[(['b', 'a'], [2, 1]), :] diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 12ab4f27d1e62..6d81bb2474ecd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -356,9 +356,9 @@ of columns could result in a larger Series result. See (:issue:`37799`). .. ipython:: python :okwarning: - In [5]: df.all(bool_only=True) + In[5]: df.all(bool_only=True) - In [6]: df[["B", "C"]].all(bool_only=True) + In[6]: df[["B", "C"]].all(bool_only=True) Other DataFrame reductions with ``numeric_only=None`` will also avoid diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 91953f693190c..3bc6f355e1f94 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -154,8 +154,10 @@ respectively. Previously, negative arguments returned empty frames. .. ipython:: python - df = pd.DataFrame([["g", "g0"], ["g", "g1"], ["g", "g2"], ["g", "g3"], - ["h", "h0"], ["h", "h1"]], columns=["A", "B"]) + df = pd.DataFrame( + [["g", "g0"], ["g", "g1"], ["g", "g2"], ["g", "g3"], ["h", "h0"], ["h", "h1"]], + columns=["A", "B"], + ) df.groupby("A").head(-1) @@ -189,10 +191,8 @@ library to produce a tight representation of :class:`DataFrame` objects df = pd.DataFrame.from_records( [[1, 3], [2, 4]], - index=pd.MultiIndex.from_tuples([("a", "b"), ("a", "c")], - names=["n1", "n2"]), - columns=pd.MultiIndex.from_tuples([("x", 1), ("y", 2)], - names=["z1", "z2"]), + index=pd.MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]), + columns=pd.MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]), ) df df.to_dict(orient='tight') @@ -436,6 +436,7 @@ This inconsistency has been removed, pandas now tests up to equality. def func(x): return x.copy() + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) df diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 43aa63c284f38..25ea01f2cd57a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,6 +39,7 @@ initialized with a ``pyarrow.DataType``. .. ipython:: python import pyarrow as pa + ser_float = pd.Series([1.0, 2.0, None], dtype="float32[pyarrow]") ser_float @@ -177,9 +178,15 @@ Added new function :func:`~pandas.from_dummies` to convert a dummy coded :class: import pandas as pd - df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], - "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], - "col2_c": [0, 0, 1]}) + df = pd.DataFrame( + { + "col1_a": [1, 0, 1], + "col1_b": [0, 1, 0], + "col2_a": [0, 1, 0], + "col2_b": [1, 0, 0], + "col2_c": [0, 0, 1], + } + ) pd.from_dummies(df, sep="_") @@ -225,7 +232,9 @@ If the compression method cannot be inferred, use the ``compression`` argument: .. 
 .. code-block:: python

-    df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"})  # noqa F821
+    df = pd.read_csv(
+        some_file_obj, compression={"method": "tar", "mode": "r:gz"}
+    )  # noqa F821

 (``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open)
@@ -241,6 +250,7 @@ apply converter methods, and parse dates (:issue:`43567`).

 .. ipython:: python

     from io import StringIO
+
     xml_dates = """<?xml version='1.0' encoding='utf-8'?>
@@ -267,7 +277,7 @@ apply converter methods, and parse dates (:issue:`43567`).
         StringIO(xml_dates),
         dtype={'sides': 'Int64'},
         converters={'degrees': str},
-        parse_dates=['date']
+        parse_dates=['date'],
     )
     df
     df.dtypes
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index cacbf8452ba32..a47658c8bb0b4 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -87,9 +87,8 @@ Below is a possibly non-exhaustive list of changes:

 .. ipython:: python

     from scipy import sparse
-    A = sparse.coo_matrix(
-        ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
-    )
+
+    A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
     ser = pd.Series.sparse.from_coo(A)
     ser.index.dtypes
@@ -154,10 +153,13 @@ When this keyword is set to ``"pyarrow"``, then these functions will return pyar

 .. ipython:: python

     import io
-    data = io.StringIO("""a,b,c,d,e,f,g,h,i
+
+    data = io.StringIO(
+        """a,b,c,d,e,f,g,h,i
     1,2.5,True,a,,,,,
     3,4.5,False,b,6,7.5,True,a,
-    """)
+    """
+    )
     df = pd.read_csv(data, dtype_backend="pyarrow")
     df.dtypes
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 329ef2859f56f..7c5c7ffa4fd00 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -200,12 +200,12 @@ The :meth:`Series.case_when` function has been added to create a Series object b

     import pandas as pd

     df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
-    default=pd.Series('default', index=df.index)
+    default = pd.Series('default', index=df.index)
     default.case_when(
-       caselist=[
-           (df.a == 1, 'first'),  # condition, replacement
-           (df.a.gt(1) & df.b.eq(5), 'second'),  # condition, replacement
-       ],
+        caselist=[
+            (df.a == 1, 'first'),  # condition, replacement
+            (df.a.gt(1) & df.b.eq(5), 'second'),  # condition, replacement
+        ],
     )
@@ -256,6 +256,7 @@ DataFrame. (:issue:`54938`)
 .. ipython:: python

     import pyarrow as pa
+
     series = pd.Series(
         [
             {"project": "pandas", "version": "2.2.0"},
@@ -263,10 +264,12 @@
             {"project": "pyarrow", "version": "13.0.0"},
         ],
         dtype=pd.ArrowDtype(
-            pa.struct([
-                ("project", pa.string()),
-                ("version", pa.string()),
-            ])
+            pa.struct(
+                [
+                    ("project", pa.string()),
+                    ("version", pa.string()),
+                ]
+            )
         ),
     )
     series.struct.explode()
@@ -292,15 +295,14 @@ a Series. (:issue:`55323`)

 .. ipython:: python

     import pyarrow as pa
+
     series = pd.Series(
         [
             [1, 2, 3],
             [4, 5],
             [6],
         ],
-        dtype=pd.ArrowDtype(
-            pa.list_(pa.int64())
-        ),
+        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
     )
     series.list[0]
@@ -413,8 +415,12 @@ index levels when joining on two indexes with different levels (:issue:`34133`).

 .. ipython:: python

-    left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))
-    right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"]))
+    left = pd.DataFrame(
+        {"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])
+    )
+    right = pd.DataFrame(
+        {"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])
+    )
     left
     right
     result = left.join(right)
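
Every hunk above is a mechanical ``black`` rewrite of a documentation snippet, and each one can be reproduced outside the patch with ``black.format_str``, the formatting entry point that ``blacken-docs`` drives. Below is a minimal sketch (not part of the patch) that re-derives the ``left`` assignment from the final hunk; ``line_length=100`` is an assumption inferred from the wrap width visible in these hunks, not a value stated anywhere in this diff.

.. code-block:: python

    # Minimal sketch: feed one removed ("-") line from the final hunk to black
    # and compare the result against the added ("+") lines.
    import black

    src = (
        'left = pd.DataFrame({"left": 1}, '
        'index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))\n'
    )

    # line_length=100 is an assumption chosen to match the observed wrap width;
    # string_normalization=False mirrors the hunks above, where single-quoted
    # strings (e.g. 'category', 'tight') are left as-is.
    mode = black.Mode(line_length=100, string_normalization=False)
    print(black.format_str(src, mode=mode))

Modulo the four-space reST directive indentation, the printed result matches the three added lines of the final hunk, which is a quick way to confirm that a given hunk is purely mechanical reformatting.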