ENH: Add NDFrame.format

topper-123 · topper-123 · commit 06d72d34c626 · 2020-06-22T22:51:44.000+01:00
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -46,6 +46,7 @@ Conversion
    DataFrame.astype
    DataFrame.convert_dtypes
    DataFrame.infer_objects
+   DataFrame.format
    DataFrame.copy
    DataFrame.bool
 
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -48,6 +48,7 @@ Conversion
    Series.astype
    Series.convert_dtypes
    Series.infer_objects
+   Series.format
    Series.copy
    Series.bool
    Series.to_numpy
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -30,6 +30,38 @@ For example, the below now works:
    ser[0]
    pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
 
+.. _whatsnew_110.format:
+
+``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`xxxxx`).
+These methods create a Series of dtype ``string`` from an arbitrary ``Series`` or ``DataFrame`` using standard Python format strings:
+
+.. ipython:: python
+
+    df = pd.DataFrame({
+        'state_name': ['California', 'Texas', 'Florida'],
+        'state_abbreviation': ['CA', 'TX', 'FL'],
+        'population': [39_512_223, 28_995_881, 21_477_737],
+        }, index=[1, 2, 3])
+    df
+    ser = df["population"]
+
+    df.format("{state_name} ({state_abbreviation}): {population:,}")
+
+    ser.format("Population: {population:,}")
+
+The output Series will always have dtype :class:`StringDtype`.
+
+Formatting using positional arguments is also possible. Passing ``positional_only=True`` is optional, but it improves performance by disallowing keyword parameters:
+
+.. ipython:: python
+
+    df.format("{} ({}): {:,}", positional_only=True)
+
+    ser.format("Population: {:,}", positional_only=True)
+
 
 .. _whatsnew_110.period_index_partial_string_slicing:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -97,6 +97,7 @@
 from pandas.core.internals import BlockManager
 from pandas.core.missing import find_valid_index
 from pandas.core.ops import _align_method_FRAME
+from pandas.core.strings import str_format
 
 from pandas.io.formats import format as fmt
 from pandas.io.formats.format import DataFrameFormatter, format_percentiles
@@ -105,6 +106,7 @@
 
 if TYPE_CHECKING:
     from pandas.core.resample import Resampler
+    from pandas.core.series import Series
 
 # goal is to be able to define the docs close to function, while still being
 # able to share
@@ -3742,6 +3744,19 @@ def __delitem__(self, key) -> None:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    @doc(str_format)
+    def format(
+        self,
+        format: str,
+        name: Optional[str] = None,
+        positional_only: bool = False,
+        how_na: str = "any",
+    ) -> "Series":
+        # Documentation is supplied by @doc(str_format); this only delegates to it.
+        return str_format(
+            self, format, name=name, positional_only=positional_only, how_na=how_na
+        )
+
     def get(self, key, default=None):
         """
         Get item from object for given key (ex: DataFrame column).
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -39,6 +39,7 @@
 from pandas.core.construction import extract_array
 
 if TYPE_CHECKING:
+    from pandas import Series
     from pandas.arrays import StringArray
 
 _cpython_optimized_encoders = (
@@ -241,6 +242,117 @@ def g(x):
         return lib.map_infer(arr, f)
 
 
+def str_format(
+    arr,
+    format: str,
+    name: str = None,
+    positional_only: bool = False,
+    how_na: str = "any",
+) -> "Series":
+    """
+    Format rows according to the format and return a Series with one string per row.
+
+    Parameters
+    ----------
+    arr : DataFrame or Series
+        The values to format.
+    format : str
+        Format string, applied to each row with ``str.format``.
+    name : Label, optional
+        The name of the returned Series.
+    positional_only : bool, default False
+        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
+        Setting to ``True`` will improve performance.
+    how_na : {"any", "all"}, default "any"
+        If "any", a row gives ``NA`` when at least one of its values is missing;
+        if "all", only when all of them are. Other values raise ``ValueError``.
+
+    Returns
+    -------
+    Series
+        A Series with dtype ``StringDtype``, formatted according to ``format``.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     'state_name': ['California', 'Texas', 'Florida'],
+    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
+    ...     'population': [39_512_223, 28_995_881, 21_477_737],
+    ...     }, index=[1, 2, 3])
+    >>> df
+       state_name state_abbreviation  population
+    1  California                 CA    39512223
+    2       Texas                 TX    28995881
+    3     Florida                 FL    21477737
+    >>> ser = df["population"]
+
+    Formatting using positional arguments:
+
+    >>> ser.format("Population: {:,}")
+    1    Population: 39,512,223
+    2    Population: 28,995,881
+    3    Population: 21,477,737
+    dtype: string
+
+    >>> df.format("{} ({}): {:,}")
+    1    California (CA): 39,512,223
+    2         Texas (TX): 28,995,881
+    3       Florida (FL): 21,477,737
+    dtype: string
+
+    Using keyword arguments (only works if column labels are strings):
+
+    >>> ser.format("Population: {population:,}")
+    1    Population: 39,512,223
+    2    Population: 28,995,881
+    3    Population: 21,477,737
+    dtype: string
+
+    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
+    1    California (CA): 39,512,223
+    2         Texas (TX): 28,995,881
+    3       Florida (FL): 21,477,737
+    dtype: string
+
+    The index can be added using the keyword 'Index':
+
+    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
+    1    California (CA): 39,512,223 (no. 1)
+    2         Texas (TX): 28,995,881 (no. 2)
+    3       Florida (FL): 21,477,737 (no. 3)
+    dtype: string
+    """
+    from pandas import NA
+    from pandas.arrays import StringArray
+    # Promote a Series to a one-column frame so both inputs share one code path.
+    if not isinstance(arr, ABCDataFrame):
+        result_wrapper = arr._constructor
+        arr_name = arr.name if arr.name is not None else "_1"
+        arr = arr.to_frame(name=arr_name)
+    else:
+        result_wrapper = arr._constructor_sliced
+    # Collapse the element-wise NA mask to one flag per row, as chosen by how_na.
+    na_mask = isna(arr)
+    if how_na == "any":
+        na_mask = na_mask.any(axis=1)
+    elif how_na == "all":
+        na_mask = na_mask.all(axis=1)
+    else:
+        raise ValueError(f"how_na must be 'any' or 'all', got {how_na!r}")
+    # Each row is passed positionally and, unless positional_only, by field name too.
+    func = format.format
+    if positional_only:
+        named_tups = arr.itertuples(index=False)
+        result = np.array([func(*named_tup) for named_tup in named_tups], dtype=object)
+    else:
+        named_tups = arr.itertuples()
+        res = [func(*named_tup[1:], **named_tup._asdict()) for named_tup in named_tups]
+        result = np.array(res, dtype=object)
+    # Rows flagged above become NA regardless of what str.format produced for them.
+    result[na_mask] = NA
+    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
+
+
 def str_count(arr, pat, flags=0):
     """
     Count occurrences of pattern in each string of the Series/Index.
diff --git a/pandas/tests/frame/methods/test_format.py b/pandas/tests/frame/methods/test_format.py
@@ -0,0 +1,78 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestFormat:
+    """Tests for ``DataFrame.format``."""
+
+    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
+    @pytest.mark.parametrize("name", [None, "X"])
+    @pytest.mark.parametrize("how_na", ["all", "any"])
+    def test_basic(self, format_str, name, how_na):
+        # positional, keyword and mixed placeholders all give the same result
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)
+
+        result = df.format(format_str, name=name, how_na=how_na)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
+    def test_with_index(self, format_str):
+        # the row label is available under the keyword "Index"
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        expected = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")
+
+        result = df.format(format_str)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["{}-{}"])
+    @pytest.mark.parametrize("positional_only", [True, False])
+    def test_positional_only(self, format_str, positional_only):
+        # purely positional formats work with and without positional_only
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string")
+
+        result = df.format(format_str, positional_only=positional_only)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
+    def test_positional_only_raises(self, format_str):
+        # keyword placeholders are not supplied when positional_only=True
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(KeyError):
+            df.format(format_str, positional_only=True)
+
+    @pytest.mark.parametrize(
+        "how_na, expected",
+        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
+    )
+    def test_na_how(self, how_na, expected):
+        # a missing value in a row that is not masked is formatted as "nan"
+        df = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
+        expected = pd.Series(expected, dtype="string")
+
+        result = df.format("{:.0f}-{:.0f}", how_na=how_na)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("how_na", ["", "some", None])
+    def test_na_how_invalid_raises(self, how_na):
+        # how_na only accepts "any" and "all"
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(ValueError):
+            df.format("{}-{}", how_na=how_na)
+
+    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
+    def test_too_many_positional_args(self, format_string):
+        # more positional placeholders than there are columns
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(IndexError):
+            df.format(format_string)
+
+    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
+    def test_too_many_named_args(self, format_string):
+        # placeholders naming a column that does not exist
+        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(KeyError):
+            df.format(format_string)