From 06d72d34c6267e1a956fd3193d9d9857afe19581 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Fri, 29 May 2020 18:03:24 +0100
Subject: [PATCH 1/3] ENH: Add NDFrame.format

---
 doc/source/reference/frame.rst            |   1 +
 doc/source/reference/series.rst           |   1 +
 doc/source/whatsnew/v1.1.0.rst            |  32 +++++++
 pandas/core/generic.py                    |  15 +++
 pandas/core/strings.py                    | 112 ++++++++++++++++++++++
 pandas/tests/frame/methods/test_format.py |  62 ++++++++++++
 6 files changed, 223 insertions(+)
 create mode 100644 pandas/tests/frame/methods/test_format.py

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index e3dfb552651a0..0768a489c3370 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -46,6 +46,7 @@ Conversion
    DataFrame.astype
    DataFrame.convert_dtypes
    DataFrame.infer_objects
+   DataFrame.format
    DataFrame.copy
    DataFrame.bool
 
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 3b595ba5ab206..973af498ee90c 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -48,6 +48,7 @@ Conversion
    Series.astype
    Series.convert_dtypes
    Series.infer_objects
+   Series.format
    Series.copy
    Series.bool
    Series.to_numpy
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 22b83425b58c2..b7145c674dbea 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -30,6 +30,38 @@ For example, the below now works:
    ser[0]
    pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
 
+.. _whatsnew_110.format:
+
+``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`xxxxx`).
+These methods allow creating a ``string`` Series from an arbitrary ``Series`` or ``DataFrame`` using standard Python format strings:
+
+.. ipython:: python
+
+    df = pd.DataFrame({
+        'state_name': ['California', 'Texas', 'Florida'],
+        'state_abbreviation': ['CA', 'TX', 'FL'],
+        'population': [39_512_223, 28_995_881, 21_477_737],
+        }, index=[1, 2, 3])
+    df
+    ser = df["population"]
+
+    df.format("{state_name} ({state_abbreviation}): {population:,}")
+
+    ser.format("Population: {population:,}")
+
+The output Series will always have dtype :class:`StringDtype`.
+
+Formatting using positional arguments is also possible. Passing ``positional_only=True`` is not required, but it improves performance by disallowing keyword fields:
+
+.. ipython:: python
+
+    df.format("{} ({}): {:,}", positional_only=True)
+
+    ser.format("Population: {:,}", positional_only=True)
+
 
 .. _whatsnew_110.period_index_partial_string_slicing:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index bb2810ba7857f..2b5f524563b57 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -97,6 +97,7 @@
 from pandas.core.internals import BlockManager
 from pandas.core.missing import find_valid_index
 from pandas.core.ops import _align_method_FRAME
+from pandas.core.strings import str_format
 
 from pandas.io.formats import format as fmt
 from pandas.io.formats.format import DataFrameFormatter, format_percentiles
@@ -105,6 +106,7 @@
 
 if TYPE_CHECKING:
     from pandas.core.resample import Resampler
+    from pandas.core.series import Series
 
 # goal is to be able to define the docs close to function, while still being
 # able to share
@@ -3742,6 +3744,19 @@ def __delitem__(self, key) -> None:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    @doc(str_format)
+    def format(
+        self,
+        format: str,
+        name: Optional[str] = None,
+        positional_only: bool = False,
+        how_na: str = "any",
+    ) -> "Series":
+
+        # Thin wrapper: all work is delegated to pandas.core.strings.str_format.
+        formatted = str_format(self, format, name, positional_only, how_na)
+        return formatted
+
     def get(self, key, default=None):
         """
         Get item from object for given key (ex: DataFrame column).
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index a1db7742916de..b937b173771fc 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -39,6 +39,7 @@
 from pandas.core.construction import extract_array
 
 if TYPE_CHECKING:
+    from pandas import Series
     from pandas.arrays import StringArray
 
 _cpython_optimized_encoders = (
@@ -241,6 +242,117 @@ def g(x):
         return lib.map_infer(arr, f)
 
 
+def str_format(
+    arr,
+    format: str,
+    name: str = None,
+    positional_only: bool = False,
+    how_na: str = "any",
+) -> "Series":
+    """
+    Format each row with ``format`` and return a Series with one string per row.
+
+    Parameters
+    ----------
+    arr: DataFrame or Series
+        The values to format.
+    format : str
+        format string.
+    name: Label, optional
+        The name of the returned Series.
+    positional_only: bool, default False
+        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
+        Setting to ``True`` will improve performance.
+    how_na: str, one of {"all", "any"}, default "any"
+        If "all", return ``NA`` if all values in row are nan values.
+        If "any", return ``NA`` if at least one of the values in row is a nan value.
+
+    Returns
+    -------
+    Series
+        A Series with dtype ``StringDtype``, formatted according to ``format``.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     'state_name': ['California', 'Texas', 'Florida'],
+    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
+    ...     'population': [39_512_223, 28_995_881, 21_477_737],
+    ...     }, index=[1, 2, 3])
+    >>> df
+       state_name state_abbreviation  population
+    1  California                 CA    39512223
+    2       Texas                 TX    28995881
+    3     Florida                 FL    21477737
+    >>> ser = df["population"]
+
+    Formatting using positional arguments:
+
+    >>> ser.format("Population: {:,}")
+    1    Population: 39,512,223
+    2    Population: 28,995,881
+    3    Population: 21,477,737
+    dtype: string
+
+    >>>  df.format("{} ({}): {:,}")
+    1    California (CA): 39,512,223
+    2         Texas (TX): 28,995,881
+    3       Florida (FL): 21,477,737
+    dtype: string
+
+    Using keyword arguments (only works if column labels are strings):
+
+    >>> ser.format("Population: {population:,}")
+    1    Population: 39,512,223
+    2    Population: 28,995,881
+    3    Population: 21,477,737
+    dtype: string
+
+    >>>  df.format("{state_name} ({state_abbreviation}): {population:,}")
+    1    California (CA): 39,512,223
+    2         Texas (TX): 28,995,881
+    3       Florida (FL): 21,477,737
+    dtype: string
+
+    The index can be added using the keyword 'Index':
+
+    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
+    1    California (CA): 39,512,223 (no. 1)
+    2         Texas (TX): 28,995,881 (no. 2)
+    3       Florida (FL): 21,477,737 (no. 3)
+    dtype: string
+    """
+    from pandas import NA
+    from pandas.arrays import StringArray
+
+    if not isinstance(arr, ABCDataFrame):
+        result_wrapper = arr._constructor
+        arr_name = arr.name if arr.name is not None else "_1"
+        arr = arr.to_frame(name=arr_name)
+    else:
+        result_wrapper = arr._constructor_sliced
+
+    na_mask = isna(arr)
+    if how_na == "any":
+        na_mask = na_mask.any(axis=1)
+    elif how_na == "all":
+        na_mask = na_mask.all(axis=1)
+    else:
+        raise ValueError(f"how_na must be 'any' or 'all', got {how_na!r}")
+
+    func = format.format
+    if positional_only:
+        named_tups = arr.itertuples(index=False)
+        result = np.array([func(*named_tup) for named_tup in named_tups], dtype=object)
+    else:
+        # Index is field 0: skip it positionally, expose it as the "Index" keyword.
+        res = [func(*tup[1:], **tup._asdict()) for tup in arr.itertuples()]
+        result = np.array(res, dtype=object)
+
+    result[na_mask] = NA
+    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
+
+
 def str_count(arr, pat, flags=0):
     """
     Count occurrences of pattern in each string of the Series/Index.
diff --git a/pandas/tests/frame/methods/test_format.py b/pandas/tests/frame/methods/test_format.py
new file mode 100644
index 0000000000000..e4164bedf837b
--- /dev/null
+++ b/pandas/tests/frame/methods/test_format.py
@@ -0,0 +1,62 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestFormat:
+    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
+    @pytest.mark.parametrize("name", [None, "X"])
+    @pytest.mark.parametrize("how_na", ["all", "any"])
+    def test_basic(self, format_str, name, how_na):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        want = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)
+        # Positional, named and mixed replacement fields must render identically.
+        got = frame.format(format_str, name=name, how_na=how_na)
+        tm.assert_series_equal(got, want)
+
+    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
+    def test_with_index(self, format_str):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        want = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")
+        # "Index" is usable as a named field next to the column values.
+        got = frame.format(format_str)
+        tm.assert_series_equal(got, want)
+
+    @pytest.mark.parametrize("format_str", ["{}-{}"])
+    @pytest.mark.parametrize("positional_only", [True, False])
+    def test_positional_only(self, format_str, positional_only):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        want = pd.Series(["1-4", "2-5", "3-6"], dtype="string")
+        # A purely positional format string works with the flag on or off.
+        got = frame.format(format_str, positional_only=positional_only)
+        tm.assert_series_equal(got, want)
+
+    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
+    def test_positional_only_raises(self, format_str):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(KeyError):
+            frame.format(format_str, positional_only=True)
+
+    @pytest.mark.parametrize(
+        "how_na, expected",
+        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
+    )
+    def test_na_how(self, how_na, expected):
+        frame = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
+        want = pd.Series(expected, dtype="string")
+        # "any": one NaN is enough for NA; "all": NA only if the whole row is NaN.
+        got = frame.format("{:.0f}-{:.0f}", how_na=how_na)
+        tm.assert_series_equal(got, want)
+
+    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
+    def test_too_many_positional_args(self, format_string):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(IndexError):
+            frame.format(format_string)
+
+    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
+    def test_too_many_named_args(self, format_string):
+        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        with pytest.raises(KeyError):
+            frame.format(format_string)

From 59b5dc2299daa8fd05359cb28702a64af237674f Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Mon, 22 Jun 2020 23:52:33 +0100
Subject: [PATCH 2/3] linting issues etc.

---
 doc/source/whatsnew/v1.1.0.rst |  5 +----
 pandas/core/generic.py         |  2 +-
 pandas/core/strings.py         | 24 +++++++++++++++++-------
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index b7145c674dbea..ff34f2a6f247c 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -44,12 +44,10 @@ These methods allow creating a ``string`` Series from arbitrary ``Series`` or ``
         'state_name': ['California', 'Texas', 'Florida'],
         'state_abbreviation': ['CA', 'TX', 'FL'],
         'population': [39_512_223, 28_995_881, 21_477_737],
-        }, index=[1, 2, 3])
+    }, index=[1, 2, 3])
     df
     ser = df["population"]
-
     df.format("{state_name} ({state_abbreviation}): {population:,}")
-
     ser.format("Population: {population:,}")
 
 The output Series will always have dtype :class:`StringDtype`.
@@ -59,7 +57,6 @@ Formatting using positional arguments is also possible (``positional_only=True``
 .. ipython:: python
 
     df.format("{} ({}): {:,}", positional_only=True)
-
     ser.format("Population: {:,}", positional_only=True)
 
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2b5f524563b57..328d095d8b3cb 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3749,7 +3749,7 @@ def format(
         self,
         format: str,
         name: Optional[str] = None,
-        positional_only: bool = False,
+        positional_only: bool_t = False,
         how_na: str = "any",
     ) -> "Series":
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index b937b173771fc..b393fe66ce214 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2,7 +2,17 @@
 from functools import wraps
 import re
 import textwrap
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Pattern,
+    Type,
+    Union,
+)
 import warnings
 
 import numpy as np
@@ -245,7 +255,7 @@ def g(x):
 def str_format(
     arr,
     format: str,
-    name: str = None,
+    name: Optional[str] = None,
     positional_only: bool = False,
     how_na: str = "any",
 ) -> "Series":
@@ -254,16 +264,16 @@ def str_format(
 
     Parameters
     ----------
-    arr: DataFrame or Series
+    arr : DataFrame or Series
         The values to format.
     format : str
         format string.
-    name: Label, optional
+    name : Label, optional
         The name of the returned Series.
-    positional_only: bool, default False
+    positional_only : bool, default False
         If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
         Setting to ``True`` will improve performance.
-    how_na: str, one of {"all", "any"}, default "any"
+    how_na : str, one of {"all", "any"}, default "any"
         If "all", return ``NA`` if all values in row are nan values.
         If "any", return ``NA`` if at least one of the values in row is a nan value.
 
@@ -294,7 +304,7 @@ def str_format(
     3    Population: 21,477,737
     dtype: string
 
-    >>>  df.format("{} ({}): {:,}")
+    >>> df.format("{} ({}): {:,}")
     1    California (CA): 39,512,223
     2         Texas (TX): 28,995,881
     3       Florida (FL): 21,477,737

From db6d235b18ab1d013c0dfa4554ca69018dee9ae0 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Tue, 23 Jun 2020 00:31:20 +0100
Subject: [PATCH 3/3] Add tests for Series.format

---
 pandas/core/strings.py                     |  2 +-
 pandas/tests/series/methods/test_format.py | 64 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 pandas/tests/series/methods/test_format.py

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index b393fe66ce214..82ca688802dbc 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -318,7 +318,7 @@ def str_format(
     3    Population: 21,477,737
     dtype: string
 
-    >>>  df.format("{state_name} ({state_abbreviation}): {population:,}")
+    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
     1    California (CA): 39,512,223
     2         Texas (TX): 28,995,881
     3       Florida (FL): 21,477,737
diff --git a/pandas/tests/series/methods/test_format.py b/pandas/tests/series/methods/test_format.py
new file mode 100644
index 0000000000000..ebb68526b2e2d
--- /dev/null
+++ b/pandas/tests/series/methods/test_format.py
@@ -0,0 +1,64 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestFormat:
+    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
+    @pytest.mark.parametrize("how_na", ["all", "any"])
+    def test_basic(self, format_str, how_na):
+        ser = pd.Series([1, 2, 3], name="A")
+        expected = pd.Series(
+            ["Value: 1", "Value: 2", "Value: 3"], dtype="string", name="X"
+        )
+
+        result = ser.format(format_str, how_na=how_na, name="X")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["{Index}-{}", "{Index}-{A}"])
+    def test_with_index(self, format_str):
+        ser = pd.Series([1, 2, 3], name="A")
+        expected = pd.Series(["0-1", "1-2", "2-3"], dtype="string", name="X")
+
+        result = ser.format(format_str, name="X")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["Value: {}"])
+    @pytest.mark.parametrize("positional_only", [True, False])
+    def test_positional_only(self, format_str, positional_only):
+        ser = pd.Series([1, 2, 3], name="A")
+        expected = pd.Series(["Value: 1", "Value: 2", "Value: 3"], dtype="string")
+
+        result = ser.format(format_str, positional_only=positional_only)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_str", ["{A}-{}", "{Index}-{}"])
+    def test_positional_only_raises(self, format_str):
+        ser = pd.Series([1, 2, 3], name="A")
+        with pytest.raises(KeyError):
+            ser.format(format_str, positional_only=True)
+
+    @pytest.mark.parametrize(
+        "how_na, expected",
+        [("any", ["Value: 1", pd.NA, pd.NA]), ("all", ["Value: 1", pd.NA, pd.NA])],
+    )
+    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
+    def test_na_how(self, how_na, expected, format_str):
+        ser = pd.Series([1, pd.NA, pd.NA], name="A")
+        expected = pd.Series(expected, dtype="string")
+        # Pass the parametrized format so positional and named fields are both run.
+        result = ser.format(format_str, how_na=how_na)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format_string", ["{}-{}", "{0}-{1}"])
+    def test_too_many_positional_args(self, format_string):
+        ser = pd.Series([1, 2, 3], name="A")
+        with pytest.raises(IndexError):
+            ser.format(format_string)
+
+    @pytest.mark.parametrize("format_string", ["{A}-{B}", "{B}"])
+    def test_unknown_named_args(self, format_string):
+        ser = pd.Series([1, 2, 3], name="A")
+        with pytest.raises(KeyError):
+            ser.format(format_string)