From 06d72d34c6267e1a956fd3193d9d9857afe19581 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 29 May 2020 18:03:24 +0100 Subject: [PATCH 1/3] ENH: Add NDFrame.format --- doc/source/reference/frame.rst | 1 + doc/source/reference/series.rst | 1 + doc/source/whatsnew/v1.1.0.rst | 32 +++++++ pandas/core/generic.py | 15 +++ pandas/core/strings.py | 112 ++++++++++++++++++++++ pandas/tests/frame/methods/test_format.py | 62 ++++++++++++ 6 files changed, 223 insertions(+) create mode 100644 pandas/tests/frame/methods/test_format.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..0768a489c3370 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -46,6 +46,7 @@ Conversion DataFrame.astype DataFrame.convert_dtypes DataFrame.infer_objects + DataFrame.format DataFrame.copy DataFrame.bool diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..973af498ee90c 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -48,6 +48,7 @@ Conversion Series.astype Series.convert_dtypes Series.infer_objects + Series.format Series.copy Series.bool Series.to_numpy diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 22b83425b58c2..b7145c674dbea 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -30,6 +30,38 @@ For example, the below now works: ser[0] pd.Series([1, 2, np.nan], dtype="Int64").astype("string") +.. _whatsnew_110.format: + +``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`xxxxx`). +These methods allow creating a ``string`` Series from arbitrary ``Series`` or ``DataFrame`` using standard Python format strings: + +.. 
ipython:: python + + df = pd.DataFrame({ + 'state_name': ['California', 'Texas', 'Florida'], + 'state_abbreviation': ['CA', 'TX', 'FL'], + 'population': [39_512_223, 28_995_881, 21_477_737], + }, index=[1, 2, 3]) + df + ser = df["population"] + + df.format("{state_name} ({state_abbreviation}): {population:,}") + + ser.format("Population: {population:,}") + +The output Series will always have dtype :class:`StringDtype`. + +Formatting using positional arguments is also possible (``positional_only=True`` is not required, but disallowing keyword parameters improves performance): + +.. ipython:: python + + df.format("{} ({}): {:,}", positional_only=True) + + ser.format("Population: {:,}", positional_only=True) + .. _whatsnew_110.period_index_partial_string_slicing: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb2810ba7857f..2b5f524563b57 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -97,6 +97,7 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME +from pandas.core.strings import str_format from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -105,6 +106,7 @@ if TYPE_CHECKING: from pandas.core.resample import Resampler + from pandas.core.series import Series # goal is to be able to define the docs close to function, while still being # able to share @@ -3742,6 +3744,19 @@ def __delitem__(self, key) -> None: # ---------------------------------------------------------------------- # Unsorted + @doc(str_format) + def format( + self, + format: str, + name: Optional[str] = None, + positional_only: bool = False, + how_na: str = "any", + ) -> "Series": + + return str_format( + self, format, name=name, positional_only=positional_only, how_na=how_na + ) + def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..b937b173771fc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -39,6 +39,7 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: + from pandas import Series from pandas.arrays import StringArray _cpython_optimized_encoders = ( @@ -241,6 +242,117 @@ def g(x): return lib.map_infer(arr, f) +def str_format( + arr, + format: str, + name: str = None, + positional_only: bool = False, + how_na: str = "any", +) -> "Series": + """ + Format rows according to the format and return a Series with one string per row. + + Parameters + ---------- + arr: DataFrame or Series + The values to format. + format : str + format string. + name: Label, optional + The name of the returned Series. + positional_only: bool, default False + If True, only allow positional parameters (i.e. allow "{}", but not "{key}"). + Setting to ``True`` will improve performance. + how_na: str, one of {"all", "any"}, default "any" + If "all", return ``NA`` if all values in row are nan values. + If "any", return ``NA`` if at least one of the values in row is a nan value. + + Returns + ------- + Series + A Series with dtype ``StringDtype``, formatted according to ``format``. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'state_name': ['California', 'Texas', 'Florida'], + ... 'state_abbreviation': ['CA', 'TX', 'FL'], + ... 'population': [39_512_223, 28_995_881, 21_477_737], + ... 
}, index=[1, 2, 3]) + >>> df + state_name state_abbreviation population + 1 California CA 39512223 + 2 Texas TX 28995881 + 3 Florida FL 21477737 + >>> ser = df["population"] + + Formatting using positional arguments: + + >>> ser.format("Population: {:,}") + 1 Population: 39,512,223 + 2 Population: 28,995,881 + 3 Population: 21,477,737 + dtype: string + + >>> df.format("{} ({}): {:,}") + 1 California (CA): 39,512,223 + 2 Texas (TX): 28,995,881 + 3 Florida (FL): 21,477,737 + dtype: string + + Using keyword arguments (only works if column labels are strings): + + >>> ser.format("Population: {population:,}") + 1 Population: 39,512,223 + 2 Population: 28,995,881 + 3 Population: 21,477,737 + dtype: string + + >>> df.format("{state_name} ({state_abbreviation}): {population:,}") + 1 California (CA): 39,512,223 + 2 Texas (TX): 28,995,881 + 3 Florida (FL): 21,477,737 + dtype: string + + The index can be added using the keyword 'Index': + + >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})") + 1 California (CA): 39,512,223 (no. 1) + 2 Texas (TX): 28,995,881 (no. 2) + 3 Florida (FL): 21,477,737 (no. 
3) + dtype: string + """ + from pandas import NA + from pandas.arrays import StringArray + + if not isinstance(arr, ABCDataFrame): + result_wrapper = arr._constructor + arr_name = arr.name if arr.name is not None else "_1" + arr = arr.to_frame(name=arr_name) + else: + result_wrapper = arr._constructor_sliced + + na_mask = isna(arr) + if how_na == "any": + na_mask = na_mask.any(axis=1) + elif how_na == "all": + na_mask = na_mask.all(axis=1) + else: + raise ValueError(how_na) + + func = format.format + if positional_only: + named_tups = arr.itertuples(index=False) + result = np.array([func(*named_tup) for named_tup in named_tups], dtype=object) + else: + named_tups = arr.itertuples() + res = [func(*named_tup[1:], **named_tup._asdict()) for named_tup in named_tups] + result = np.array(res, dtype=object) + + result[na_mask] = NA + return result_wrapper(StringArray(result), index=arr.index.copy(), name=name) + + def str_count(arr, pat, flags=0): """ Count occurrences of pattern in each string of the Series/Index. 
diff --git a/pandas/tests/frame/methods/test_format.py b/pandas/tests/frame/methods/test_format.py new file mode 100644 index 0000000000000..e4164bedf837b --- /dev/null +++ b/pandas/tests/frame/methods/test_format.py @@ -0,0 +1,62 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestFormat: + @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"]) + @pytest.mark.parametrize("name", [None, "X"]) + @pytest.mark.parametrize("how_na", ["all", "any"]) + def test_basic(self, format_str, name, how_na): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name) + + result = df.format(format_str, name=name, how_na=how_na) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"]) + def test_with_index(self, format_str): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string") + + result = df.format(format_str) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["{}-{}"]) + @pytest.mark.parametrize("positional_only", [True, False]) + def test_positional_only(self, format_str, positional_only): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string") + + result = df.format(format_str, positional_only=positional_only) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"]) + def test_positional_only_raises(self, format_str): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + with pytest.raises(KeyError): + df.format(format_str, positional_only=True) + + @pytest.mark.parametrize( + "how_na, expected", + [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])], + ) + def test_na_how(self, how_na, expected): + df = pd.DataFrame({"A": [1, None, None], "B": [4, 5, 
None]}) + expected = pd.Series(expected, dtype="string") + + result = df.format("{:.0f}-{:.0f}", how_na=how_na) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"]) + def test_too_many_positional_args(self, format_string): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + with pytest.raises(IndexError): + df.format(format_string) + + @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"]) + def test_too_many_named_args(self, format_string): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + with pytest.raises(KeyError): + df.format(format_string) From 59b5dc2299daa8fd05359cb28702a64af237674f Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 22 Jun 2020 23:52:33 +0100 Subject: [PATCH 2/3] linting issues etc. --- doc/source/whatsnew/v1.1.0.rst | 5 +---- pandas/core/generic.py | 2 +- pandas/core/strings.py | 24 +++++++++++++++++------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b7145c674dbea..ff34f2a6f247c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,12 +44,10 @@ These methods allow creating a ``string`` Series from arbitrary ``Series`` or `` 'state_name': ['California', 'Texas', 'Florida'], 'state_abbreviation': ['CA', 'TX', 'FL'], 'population': [39_512_223, 28_995_881, 21_477_737], - }, index=[1, 2, 3]) + }, index=[1, 2, 3]) df ser = df["population"] - df.format("{state_name} ({state_abbreviation}): {population:,}") - ser.format("Population: {population:,}") The output Series will always have dtype :class:`StringDtype`. @@ -59,7 +57,6 @@ Formatting using positional arguments is also possible (``positional_only=True`` .. 
ipython:: python df.format("{} ({}): {:,}", positional_only=True) - ser.format("Population: {:,}", positional_only=True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2b5f524563b57..328d095d8b3cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3749,7 +3749,7 @@ def format( self, format: str, name: Optional[str] = None, - positional_only: bool = False, + positional_only: bool_t = False, how_na: str = "any", ) -> "Series": diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b937b173771fc..b393fe66ce214 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,17 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Pattern, + Type, + Union, +) import warnings import numpy as np @@ -245,7 +255,7 @@ def g(x): def str_format( arr, format: str, - name: str = None, + name: Optional[str] = None, positional_only: bool = False, how_na: str = "any", ) -> "Series": @@ -254,16 +264,16 @@ def str_format( Parameters ---------- - arr: DataFrame or Series + arr : DataFrame or Series The values to format. format : str format string. - name: Label, optional + name : Label, optional The name of the returned Series. - positional_only: bool, default False + positional_only : bool, default False If True, only allow positional parameters (i.e. allow "{}", but not "{key}"). Setting to ``True`` will improve performance. - how_na: str, one of {"all", "any"}, default "any" + how_na : str, one of {"all", "any"}, default "any" If "all", return ``NA`` if all values in row are nan values. If "any", return ``NA`` if at least one of the values in row is a nan value. 
@@ -294,7 +304,7 @@ def str_format( 3 Population: 21,477,737 dtype: string - >>> df.format("{} ({}): {:,}") + >>> df.format("{} ({}): {:,}") 1 California (CA): 39,512,223 2 Texas (TX): 28,995,881 3 Florida (FL): 21,477,737 From db6d235b18ab1d013c0dfa4554ca69018dee9ae0 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 23 Jun 2020 00:31:20 +0100 Subject: [PATCH 3/3] Add tests for Series.format --- pandas/core/strings.py | 2 +- pandas/tests/series/methods/test_format.py | 64 ++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/series/methods/test_format.py diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b393fe66ce214..82ca688802dbc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -318,7 +318,7 @@ def str_format( 3 Population: 21,477,737 dtype: string - >>> df.format("{state_name} ({state_abbreviation}): {population:,}") + >>> df.format("{state_name} ({state_abbreviation}): {population:,}") 1 California (CA): 39,512,223 2 Texas (TX): 28,995,881 3 Florida (FL): 21,477,737 diff --git a/pandas/tests/series/methods/test_format.py b/pandas/tests/series/methods/test_format.py new file mode 100644 index 0000000000000..ebb68526b2e2d --- /dev/null +++ b/pandas/tests/series/methods/test_format.py @@ -0,0 +1,64 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestFormat: + @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"]) + @pytest.mark.parametrize("how_na", ["all", "any"]) + def test_basic(self, format_str, how_na): + ser = pd.Series([1, 2, 3], name="A") + expected = pd.Series( + ["Value: 1", "Value: 2", "Value: 3"], dtype="string", name="X" + ) + + result = ser.format(format_str, how_na=how_na, name="X") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["{Index}-{}", "{Index}-{A}"]) + def test_with_index(self, format_str): + ser = pd.Series([1, 2, 3], name="A") + expected = pd.Series(["0-1", "1-2", "2-3"], 
dtype="string", name="X") + + result = ser.format(format_str, name="X") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["Value: {}"]) + @pytest.mark.parametrize("positional_only", [True, False]) + def test_positional_only(self, format_str, positional_only): + ser = pd.Series([1, 2, 3], name="A") + expected = pd.Series(["Value: 1", "Value: 2", "Value: 3"], dtype="string") + + result = ser.format(format_str, positional_only=positional_only) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_str", ["{A}-{}", "{Index}-{}"]) + def test_positional_only_raises(self, format_str): + ser = pd.Series([1, 2, 3], name="A") + with pytest.raises(KeyError): + ser.format(format_str, positional_only=True) + + @pytest.mark.parametrize( + "how_na, expected", + [("any", ["Value: 1", pd.NA, pd.NA]), ("all", ["Value: 1", pd.NA, pd.NA])], + ) + @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"]) + def test_na_how(self, how_na, expected, format_str): + ser = pd.Series([1, pd.NA, pd.NA], name="A") + expected = pd.Series(expected, dtype="string") + + result = ser.format("Value: {}", how_na=how_na) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format_string", ["{}-{}", "{0}-{1}"]) + def test_too_many_positional_args(self, format_string): + ser = pd.Series([1, 2, 3], name="A") + with pytest.raises(IndexError): + ser.format(format_string) + + @pytest.mark.parametrize("format_string", ["{A}-{B}", "{B}"]) + def test_unknown_named_args(self, format_string): + ser = pd.Series([1, 2, 3], name="A") + with pytest.raises(KeyError): + ser.format(format_string)