-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add NDFrame.format for easier conversion to string dtype #34941
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,17 @@ | |
from functools import wraps | ||
import re | ||
import textwrap | ||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
Callable, | ||
Dict, | ||
List, | ||
Optional, | ||
Pattern, | ||
Type, | ||
Union, | ||
) | ||
import warnings | ||
|
||
import numpy as np | ||
|
@@ -39,6 +49,7 @@ | |
from pandas.core.construction import extract_array | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Series | ||
from pandas.arrays import StringArray | ||
|
||
_cpython_optimized_encoders = ( | ||
|
@@ -241,6 +252,117 @@ def g(x): | |
return lib.map_infer(arr, f) | ||
|
||
|
||
def str_format(
    arr,
    format: str,
    name: Optional[str] = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    """
    Format rows according to the format and return a Series with one string per row.

    Parameters
    ----------
    arr : DataFrame or Series
        The values to format.
    format : str
        Format string, using the ``str.format`` replacement-field syntax.
    name : Label, optional
        The name of the returned Series.
    positional_only : bool, default False
        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
        Setting to ``True`` will improve performance.
    how_na : str, one of {"all", "any"}, default "any"
        If "all", return ``NA`` if all values in row are nan values.
        If "any", return ``NA`` if at least one of the values in row is a nan value.

    Returns
    -------
    Series
        A Series with dtype ``StringDtype``, formatted according to ``format``.

    Raises
    ------
    ValueError
        If ``how_na`` is neither "any" nor "all".
    IndexError
        If ``format`` refers to more positional fields than there are columns.
    KeyError
        If ``format`` refers to a named field that is not available (this includes
        every named field when ``positional_only`` is True).

    Notes
    -----
    Rows that end up as ``NA`` are not passed to ``format`` at all, so a format
    spec that is invalid for a missing value (e.g. ``"{:,}"`` applied to ``None``)
    does not raise for those rows.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'state_name': ['California', 'Texas', 'Florida'],
    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
    ...     'population': [39_512_223, 28_995_881, 21_477_737],
    ...     }, index=[1, 2, 3])
    >>> df
       state_name state_abbreviation  population
    1  California                 CA    39512223
    2       Texas                 TX    28995881
    3     Florida                 FL    21477737
    >>> ser = df["population"]

    Formatting using positional arguments:

    >>> ser.format("Population: {:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{} ({}): {:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    Using keyword arguments (only works if column labels are strings):

    >>> ser.format("Population: {population:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    The index can be added using the keyword 'Index':

    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
    1    California (CA): 39,512,223 (no. 1)
    2         Texas (TX): 28,995,881 (no. 2)
    3       Florida (FL): 21,477,737 (no. 3)
    dtype: string
    """
    # Design note (from review): returning a like-shaped DataFrame, with the
    # format applied per column, was suggested as an alternative. The author's
    # reasoning for one string per row is that users rarely want to convert the
    # whole frame and will usually select the columns to combine first.
    from pandas import NA
    from pandas.arrays import StringArray

    # Validate up front so a bad option fails before any per-row work is done.
    if how_na not in ("any", "all"):
        raise ValueError(f"how_na must be 'any' or 'all', got {how_na!r}")

    if not isinstance(arr, ABCDataFrame):
        # Treat a Series as a one-column frame so both inputs share one code path.
        result_wrapper = arr._constructor
        arr_name = arr.name if arr.name is not None else "_1"
        arr = arr.to_frame(name=arr_name)
    else:
        result_wrapper = arr._constructor_sliced

    na_mask = isna(arr)
    na_mask = na_mask.any(axis=1) if how_na == "any" else na_mask.all(axis=1)
    na_rows = np.asarray(na_mask, dtype=bool)

    func = format.format
    result = np.empty(len(arr), dtype=object)
    if positional_only:
        # No index and no keyword dict: cheaper, but "{key}" fields raise KeyError.
        for i, row in enumerate(arr.itertuples(index=False)):
            result[i] = NA if na_rows[i] else func(*row)
    else:
        # row[0] is the index; it is only reachable by keyword, as "{Index}".
        for i, row in enumerate(arr.itertuples()):
            result[i] = NA if na_rows[i] else func(*row[1:], **row._asdict())

    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
|
||
|
||
def str_count(arr, pat, flags=0): | ||
""" | ||
Count occurrences of pattern in each string of the Series/Index. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import pytest | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
|
||
class TestFormat:
    """Tests for ``DataFrame.format``."""

    @staticmethod
    def _int_frame():
        # Two integer columns without missing values, shared by most tests.
        return pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    @pytest.mark.parametrize("format_str", ["{}-{}", "{A}-{B}", "{}-{B}"])
    @pytest.mark.parametrize("name", [None, "X"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, name, how_na):
        # Positional, keyword and mixed fields must all produce the same rows.
        frame = self._int_frame()
        result = frame.format(format_str, name=name, how_na=how_na)

        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string", name=name)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}-{}", "{Index}-{A}-{B}"])
    def test_with_index(self, format_str):
        # The row label is exposed through the "Index" keyword.
        frame = self._int_frame()
        result = frame.format(format_str)

        expected = pd.Series(["0-1-4", "1-2-5", "2-3-6"], dtype="string")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{}-{}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # Purely positional formats give the same result with either setting.
        frame = self._int_frame()
        result = frame.format(format_str, positional_only=positional_only)

        expected = pd.Series(["1-4", "2-5", "3-6"], dtype="string")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{B}", "{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Any named field is rejected when positional_only=True.
        frame = self._int_frame()
        with pytest.raises(KeyError):
            frame.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["1-4", pd.NA, pd.NA]), ("all", ["1-4", "nan-5", pd.NA])],
    )
    def test_na_how(self, how_na, expected):
        # Row 1 has one missing value, row 2 has only missing values.
        frame = pd.DataFrame({"A": [1, None, None], "B": [4, 5, None]})
        result = frame.format("{:.0f}-{:.0f}", how_na=how_na)

        expected_ser = pd.Series(expected, dtype="string")
        tm.assert_series_equal(result, expected_ser)

    @pytest.mark.parametrize("format_string", ["{}-{}-{}", "{0}-{1}-{2}"])
    def test_too_many_positional_args(self, format_string):
        # Three positional fields, but the frame only has two columns.
        frame = self._int_frame()
        with pytest.raises(IndexError):
            frame.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}-{C}", "{C}"])
    def test_too_many_named_args(self, format_string):
        # "C" is not a column of the frame.
        frame = self._int_frame()
        with pytest.raises(KeyError):
            frame.format(format_string)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import pytest | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
|
||
class TestFormat:
    """Tests for ``Series.format``."""

    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    @pytest.mark.parametrize("how_na", ["all", "any"])
    def test_basic(self, format_str, how_na):
        # The Series name ("A") is usable as a keyword field.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(
            ["Value: 1", "Value: 2", "Value: 3"], dtype="string", name="X"
        )

        result = ser.format(format_str, how_na=how_na, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{Index}-{}", "{Index}-{A}"])
    def test_with_index(self, format_str):
        # The row label is exposed through the "Index" keyword.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["0-1", "1-2", "2-3"], dtype="string", name="X")

        result = ser.format(format_str, name="X")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["Value: {}"])
    @pytest.mark.parametrize("positional_only", [True, False])
    def test_positional_only(self, format_str, positional_only):
        # A purely positional format gives the same result with either setting.
        ser = pd.Series([1, 2, 3], name="A")
        expected = pd.Series(["Value: 1", "Value: 2", "Value: 3"], dtype="string")

        result = ser.format(format_str, positional_only=positional_only)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_str", ["{A}-{}", "{Index}-{}"])
    def test_positional_only_raises(self, format_str):
        # Any named field is rejected when positional_only=True.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_str, positional_only=True)

    @pytest.mark.parametrize(
        "how_na, expected",
        [("any", ["Value: 1", pd.NA, pd.NA]), ("all", ["Value: 1", pd.NA, pd.NA])],
    )
    @pytest.mark.parametrize("format_str", ["Value: {}", "Value: {A}"])
    def test_na_how(self, how_na, expected, format_str):
        # A Series has one value per row, so "any" and "all" must agree.
        ser = pd.Series([1, pd.NA, pd.NA], name="A")
        expected = pd.Series(expected, dtype="string")

        # Use the parametrized format so the keyword variant is exercised too.
        result = ser.format(format_str, how_na=how_na)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("format_string", ["{}-{}", "{0}-{1}"])
    def test_too_many_positional_args(self, format_string):
        # Two positional fields, but a Series only supplies one value per row.
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(IndexError):
            ser.format(format_string)

    @pytest.mark.parametrize("format_string", ["{A}-{B}", "{B}"])
    def test_unknown_named_args(self, format_string):
        # "B" matches neither the Series name nor "Index".
        ser = pd.Series([1, 2, 3], name="A")
        with pytest.raises(KeyError):
            ser.format(format_string)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we actually need this parameter? Isn't it obvious that the format is positional-only if it has no field labels?