Skip to content

ENH: Add NDFrame.format for easier conversion to string dtype #34941

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Conversion
DataFrame.astype
DataFrame.convert_dtypes
DataFrame.infer_objects
DataFrame.format
DataFrame.copy
DataFrame.bool

Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Conversion
Series.astype
Series.convert_dtypes
Series.infer_objects
Series.format
Series.copy
Series.bool
Series.to_numpy
Expand Down
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,35 @@ For example, the below now works:
ser[0]
pd.Series([1, 2, np.nan], dtype="Int64").astype("string")

.. _whatsnew_110.format:

``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`34941`).
These methods create a ``string``-dtype Series from an arbitrary ``Series`` or ``DataFrame`` using standard Python format strings:

.. ipython:: python

df = pd.DataFrame({
'state_name': ['California', 'Texas', 'Florida'],
'state_abbreviation': ['CA', 'TX', 'FL'],
'population': [39_512_223, 28_995_881, 21_477_737],
}, index=[1, 2, 3])
df
ser = df["population"]
df.format("{state_name} ({state_abbreviation}): {population:,}")
ser.format("Population: {population:,}")

The output Series will always have dtype :class:`StringDtype`.

Formatting using positional arguments is also possible. Passing ``positional_only=True`` is not required, but it improves performance by disallowing keyword (named) fields:

.. ipython:: python

df.format("{} ({}): {:,}", positional_only=True)
ser.format("Population: {:,}", positional_only=True)


.. _whatsnew_110.period_index_partial_string_slicing:

Expand Down
15 changes: 15 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
from pandas.core.internals import BlockManager
from pandas.core.missing import find_valid_index
from pandas.core.ops import _align_method_FRAME
from pandas.core.strings import str_format

from pandas.io.formats import format as fmt
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
Expand All @@ -105,6 +106,7 @@

if TYPE_CHECKING:
from pandas.core.resample import Resampler
from pandas.core.series import Series

# goal is to be able to define the docs close to function, while still being
# able to share
Expand Down Expand Up @@ -3742,6 +3744,19 @@ def __delitem__(self, key) -> None:
# ----------------------------------------------------------------------
# Unsorted

@doc(str_format)
def format(
    self,
    format: str,
    name: Optional[str] = None,
    positional_only: bool_t = False,
    how_na: str = "any",
) -> "Series":
    # Thin delegation to ``str_format``, which does all of the work.
    # NOTE(review): no docstring is written here on purpose; it is presumably
    # shared from ``str_format`` through the ``@doc`` decorator -- confirm.
    options = {"name": name, "positional_only": positional_only, "how_na": how_na}
    return str_format(self, format, **options)

def get(self, key, default=None):
"""
Get item from object for given key (ex: DataFrame column).
Expand Down
124 changes: 123 additions & 1 deletion pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
from functools import wraps
import re
import textwrap
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Optional,
Pattern,
Type,
Union,
)
import warnings

import numpy as np
Expand Down Expand Up @@ -39,6 +49,7 @@
from pandas.core.construction import extract_array

if TYPE_CHECKING:
from pandas import Series
from pandas.arrays import StringArray

_cpython_optimized_encoders = (
Expand Down Expand Up @@ -241,6 +252,117 @@ def g(x):
return lib.map_infer(arr, f)


def str_format(
    arr,
    format: str,
    name: Optional[str] = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    """
    Format rows according to the format and return a Series with one string per row.

    Parameters
    ----------
    arr : DataFrame or Series
        The values to format.
    format : str
        A standard Python format string (as accepted by ``str.format``) that is
        applied to each row.
    name : Label, optional
        The name of the returned Series.
    positional_only : bool, default False
        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
        Setting to ``True`` will improve performance.
    how_na : str, one of {"all", "any"}, default "any"
        If "all", return ``NA`` if all values in row are NA values.
        If "any", return ``NA`` if at least one of the values in row is an NA value.

    Returns
    -------
    Series
        A Series with dtype ``StringDtype``, formatted according to ``format``.

    Raises
    ------
    ValueError
        If ``how_na`` is neither "any" nor "all".
    IndexError
        If ``format`` uses more positional fields than there are columns.
    KeyError
        If ``format`` uses a named field that is not available, which includes
        any named field when ``positional_only=True``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'state_name': ['California', 'Texas', 'Florida'],
    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
    ...     'population': [39_512_223, 28_995_881, 21_477_737],
    ... }, index=[1, 2, 3])
    >>> df
       state_name state_abbreviation  population
    1  California                 CA    39512223
    2       Texas                 TX    28995881
    3     Florida                 FL    21477737
    >>> ser = df["population"]

    Formatting using positional arguments:

    >>> ser.format("Population: {:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{} ({}): {:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    Using keyword arguments (only works if column labels are strings):

    >>> ser.format("Population: {population:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    The index can be added using the keyword 'Index':

    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
    1    California (CA): 39,512,223 (no. 1)
    2         Texas (TX): 28,995,881 (no. 2)
    3       Florida (FL): 21,477,737 (no. 3)
    dtype: string
    """
    # NOTE(review): open questions raised in review of this function:
    #  - Is ``positional_only`` needed at all? Positional use may be obvious
    #    from the absence of field labels in ``format``.
    #  - Would it be more useful to format each column and return a
    #    like-shaped DataFrame instead of a single Series? The author expects
    #    users to format selected columns, e.g.
    #    ``df.assign(x=df['x'].format(...), y=df['y'].format(...))``, and
    #    suggested a ``combine=True`` keyword that, when False, works on
    #    single columns.
    from pandas import NA
    from pandas.arrays import StringArray

    # Validate up front so a bad argument fails fast with a readable message,
    # before any conversion or NA detection is done.
    if how_na not in ("any", "all"):
        raise ValueError(f"how_na must be either 'any' or 'all', got {how_na!r}")

    if isinstance(arr, ABCDataFrame):
        result_wrapper = arr._constructor_sliced
    else:
        result_wrapper = arr._constructor
        # Work on a one-column frame so both input kinds share the row loop;
        # an unnamed Series gets the placeholder column label "_1".
        arr_name = arr.name if arr.name is not None else "_1"
        arr = arr.to_frame(name=arr_name)

    na_mask = isna(arr)
    na_mask = na_mask.any(axis=1) if how_na == "any" else na_mask.all(axis=1)

    func = format.format
    if positional_only:
        rows = arr.itertuples(index=False)
        formatted = [func(*row) for row in rows]
    else:
        # Each row is (Index, col_1, col_2, ...): the column values are passed
        # positionally and every field, including "Index", by keyword.
        rows = arr.itertuples()
        formatted = [func(*row[1:], **row._asdict()) for row in rows]
    result = np.array(formatted, dtype=object)

    # Rows selected by ``how_na`` are blanked out after formatting.
    result[na_mask] = NA
    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)


def str_count(arr, pat, flags=0):
"""
Count occurrences of pattern in each string of the Series/Index.
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/frame/methods/test_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest

import pandas as pd
import pandas._testing as tm


class TestFormat:
    """Tests for ``DataFrame.format``."""

    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
    @pytest.mark.parametrize("name", [None, "X"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, name, how_na):
        # Positional, named and mixed fields must all render the same rows.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        formatted = frame.format(format_str, name=name, how_na=how_na)

        wanted = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)
        tm.assert_series_equal(formatted, wanted)

    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
    def test_with_index(self, format_str):
        # The index is reachable through the "Index" keyword field.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        formatted = frame.format(format_str)

        wanted = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")
        tm.assert_series_equal(formatted, wanted)

    @pytest.mark.parametrize("format_str", ["{}-{}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # A purely positional format gives the same result either way.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        formatted = frame.format(format_str, positional_only=positional_only)

        wanted = pd.Series(["1-4", "2-5", "3-6"], dtype="string")
        tm.assert_series_equal(formatted, wanted)

    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Named fields are unavailable when positional_only=True.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            frame.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
    )
    def test_na_how(self, how_na, expected):
        # "any" blanks a row with at least one NA, "all" only fully-NA rows.
        frame = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
        formatted = frame.format("{:.0f}-{:.0f}", how_na=how_na)

        wanted = pd.Series(expected, dtype="string")
        tm.assert_series_equal(formatted, wanted)

    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
    def test_too_many_positional_args(self, format_string):
        # Three positional fields, but the frame only has two columns.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(IndexError):
            frame.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
    def test_too_many_named_args(self, format_string):
        # "C" is not a column of the frame.
        frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            frame.format(format_string)
64 changes: 64 additions & 0 deletions pandas/tests/series/methods/test_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pytest

import pandas as pd
import pandas._testing as tm


class TestFormat:
    """Tests for ``Series.format``."""

    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, how_na):
        # Positional and named fields must render the same values.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(
            ["Value: 1", "Value: 2", "Value: 3"], dtype="string", name="X"
        )

        result = ser.format(format_str, how_na=how_na, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}", "{Index}-{A}"])
    def test_with_index(self, format_str):
        # The index is reachable through the "Index" keyword field.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["0-1", "1-2", "2-3"], dtype="string", name="X")

        result = ser.format(format_str, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["Value: {}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # A purely positional format gives the same result either way.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["Value: 1", "Value: 2", "Value: 3"], dtype="string")

        result = ser.format(format_str, positional_only=positional_only)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Named fields are unavailable when positional_only=True.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["Value: 1", pd.NA, pd.NA]), ("all", ["Value: 1", pd.NA, pd.NA])],
    )
    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    def test_na_how(self, how_na, expected, format_str):
        # With a single column, "any" and "all" mask exactly the same rows.
        ser = pd.Series([1, pd.NA, pd.NA], name="A")
        expected = pd.Series(expected, dtype="string")

        # Use the parametrized format so the named-field case is exercised
        # too, instead of always formatting with the positional string.
        result = ser.format(format_str, how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_string", ["{}-{}", "{0}-{1}"])
    def test_too_many_positional_args(self, format_string):
        # Two positional fields, but a Series only provides one value.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(IndexError):
            ser.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}", "{B}"])
    def test_unknown_named_args(self, format_string):
        # "B" is not the name of the Series.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_string)