Skip to content

Commit 06d72d3

Browse files
committed
ENH: Add NDFrame.format
1 parent 506eb54 commit 06d72d3

File tree

6 files changed

+223
-0
lines changed

6 files changed

+223
-0
lines changed

doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Conversion
4646
DataFrame.astype
4747
DataFrame.convert_dtypes
4848
DataFrame.infer_objects
49+
DataFrame.format
4950
DataFrame.copy
5051
DataFrame.bool
5152

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Conversion
4848
Series.astype
4949
Series.convert_dtypes
5050
Series.infer_objects
51+
Series.format
5152
Series.copy
5253
Series.bool
5354
Series.to_numpy

doc/source/whatsnew/v1.1.0.rst

+32
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,38 @@ For example, the below now works:
3030
ser[0]
3131
pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
3232
33+
.. _whatsnew_110.format:
34+
35+
``DataFrame.format`` and ``Series.format`` for complex conversion to StringDtype
36+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
37+
38+
New methods :meth:`DataFrame.format` and :meth:`Series.format` have been added (:issue:`xxxxx`).
39+
These methods allow creating a ``string`` Series from an arbitrary ``Series`` or ``DataFrame`` using standard Python format strings:
40+
41+
.. ipython:: python
42+
43+
df = pd.DataFrame({
44+
'state_name': ['California', 'Texas', 'Florida'],
45+
'state_abbreviation': ['CA', 'TX', 'FL'],
46+
'population': [39_512_223, 28_995_881, 21_477_737],
47+
}, index=[1, 2, 3])
48+
df
49+
ser = df["population"]
50+
51+
df.format("{state_name} ({state_abbreviation}): {population:,}")
52+
53+
ser.format("Population: {population:,}")
54+
55+
The output Series will always have dtype :class:`StringDtype`.
56+
57+
Formatting using positional arguments is also possible. Passing ``positional_only=True`` is not required, but it improves performance by disallowing keyword parameters:
58+
59+
.. ipython:: python
60+
61+
df.format("{} ({}): {:,}", positional_only=True)
62+
63+
ser.format("Population: {:,}", positional_only=True)
64+
3365
3466
.. _whatsnew_110.period_index_partial_string_slicing:
3567

pandas/core/generic.py

+15
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
from pandas.core.internals import BlockManager
9898
from pandas.core.missing import find_valid_index
9999
from pandas.core.ops import _align_method_FRAME
100+
from pandas.core.strings import str_format
100101

101102
from pandas.io.formats import format as fmt
102103
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
@@ -105,6 +106,7 @@
105106

106107
if TYPE_CHECKING:
107108
from pandas.core.resample import Resampler
109+
from pandas.core.series import Series
108110

109111
# goal is to be able to define the docs close to function, while still being
110112
# able to share
@@ -3742,6 +3744,19 @@ def __delitem__(self, key) -> None:
37423744
# ----------------------------------------------------------------------
37433745
# Unsorted
37443746

3747+
@doc(str_format)
def format(
    self,
    format: str,
    name: Optional[str] = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    # Deliberately no docstring in the body: this method is decorated with
    # ``@doc(str_format)`` and only delegates to ``str_format``.
    # NOTE(review): presumably the decorator attaches ``str_format``'s
    # documentation to this method -- confirm against ``doc``'s definition.
    forwarded_options = {
        "name": name,
        "positional_only": positional_only,
        "how_na": how_na,
    }
    return str_format(self, format, **forwarded_options)
3759+
37453760
def get(self, key, default=None):
37463761
"""
37473762
Get item from object for given key (ex: DataFrame column).

pandas/core/strings.py

+112
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from pandas.core.construction import extract_array
4040

4141
if TYPE_CHECKING:
42+
from pandas import Series
4243
from pandas.arrays import StringArray
4344

4445
_cpython_optimized_encoders = (
@@ -241,6 +242,117 @@ def g(x):
241242
return lib.map_infer(arr, f)
242243

243244

245+
def str_format(
    arr,
    format: str,
    name: str = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    """
    Format rows according to the format and return a Series with one string per row.

    Parameters
    ----------
    arr : DataFrame or Series
        The values to format.
    format : str
        Format string, using the standard Python ``str.format`` syntax.
    name : Label, optional
        The name of the returned Series.
    positional_only : bool, default False
        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
        Setting to ``True`` will improve performance.
    how_na : str, one of {"all", "any"}, default "any"
        If "all", return ``NA`` if all values in row are nan values.
        If "any", return ``NA`` if at least one of the values in row is a nan value.

    Returns
    -------
    Series
        A Series with dtype ``StringDtype``, formatted according to ``format``.

    Raises
    ------
    ValueError
        If ``how_na`` is neither "any" nor "all".
    KeyError
        If ``format`` uses a keyword that is not available, e.g. any keyword
        when ``positional_only=True``.
    IndexError
        If ``format`` uses more positional fields than there are columns.

    Notes
    -----
    Rows whose result is ``NA`` (according to ``how_na``) are never passed to
    the format string, so a format specification that the missing value does
    not support (e.g. ``"{:,}"`` applied to ``None``) does not raise for them.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'state_name': ['California', 'Texas', 'Florida'],
    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
    ...     'population': [39_512_223, 28_995_881, 21_477_737],
    ...     }, index=[1, 2, 3])
    >>> df
       state_name state_abbreviation  population
    1  California                 CA    39512223
    2       Texas                 TX    28995881
    3     Florida                 FL    21477737
    >>> ser = df["population"]

    Formatting using positional arguments:

    >>> ser.format("Population: {:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{} ({}): {:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    Using keyword arguments (only works if column labels are strings):

    >>> ser.format("Population: {population:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    The index can be added using the keyword 'Index':

    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
    1    California (CA): 39,512,223 (no. 1)
    2         Texas (TX): 28,995,881 (no. 2)
    3       Florida (FL): 21,477,737 (no. 3)
    dtype: string
    """
    from pandas import NA
    from pandas.arrays import StringArray

    # Validate up front so that a bad argument fails fast with a helpful
    # message instead of after the NA mask has already been computed.
    if how_na not in ("any", "all"):
        raise ValueError(f"how_na must be one of 'any' or 'all', got {repr(how_na)}")

    if isinstance(arr, ABCDataFrame):
        result_wrapper = arr._constructor_sliced
    else:
        result_wrapper = arr._constructor
        # An unnamed Series gets the placeholder column label "_1" so that it
        # can be converted to a single-column frame.
        arr_name = arr.name if arr.name is not None else "_1"
        arr = arr.to_frame(name=arr_name)

    na_mask = isna(arr)
    na_mask = na_mask.any(axis=1) if how_na == "any" else na_mask.all(axis=1)
    # Plain boolean ndarray: iterated in lockstep with the rows below.
    skip_row = np.asarray(na_mask, dtype=bool)

    func = format.format
    if positional_only:
        rows = arr.itertuples(index=False)
        # Rows that end up as NA are skipped rather than formatted-then-masked,
        # so a format spec unsupported by the missing value cannot raise.
        res = [NA if skip else func(*row) for row, skip in zip(rows, skip_row)]
    else:
        rows = arr.itertuples()
        # row[0] is the index: it is only reachable by keyword ("Index"),
        # while the column values are available positionally and by keyword.
        res = [
            NA if skip else func(*row[1:], **row._asdict())
            for row, skip in zip(rows, skip_row)
        ]

    result = np.array(res, dtype=object)
    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
354+
355+
244356
def str_count(arr, pat, flags=0):
245357
"""
246358
Count occurrences of pattern in each string of the Series/Index.
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import pytest
2+
3+
import pandas as pd
4+
import pandas._testing as tm
5+
6+
7+
class TestFormat:
    # Tests for ``DataFrame.format`` / ``Series.format``.

    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
    @pytest.mark.parametrize("name", [None, "X"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, name, how_na):
        # positional, keyword and mixed fields all give the same output
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)

        result = df.format(format_str, name=name, how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{}", "{A}"])
    def test_series(self, format_str):
        # Series input: the Series name is usable as a keyword
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["1", "2", "3"], dtype="string")

        result = ser.format(format_str)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
    def test_with_index(self, format_str):
        # the index is only available through the "Index" keyword
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")

        result = df.format(format_str)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{}-{}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string")

        result = df.format(format_str, positional_only=positional_only)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # keywords (including "Index") are unavailable with positional_only=True
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            df.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
    )
    def test_na_how(self, how_na, expected):
        df = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
        expected = pd.Series(expected, dtype="string")

        result = df.format("{:.0f}-{:.0f}", how_na=how_na)
        tm.assert_series_equal(result, expected)

    def test_invalid_how_na_raises(self):
        # only "any" and "all" are accepted for how_na
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(ValueError, match="invalid"):
            df.format("{}-{}", how_na="invalid")

    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
    def test_too_many_positional_args(self, format_string):
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(IndexError):
            df.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
    def test_too_many_named_args(self, format_string):
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(KeyError):
            df.format(format_string)

0 commit comments

Comments
 (0)