|
39 | 39 | from pandas.core.construction import extract_array
|
40 | 40 |
|
41 | 41 | if TYPE_CHECKING:
|
| 42 | + from pandas import Series |
42 | 43 | from pandas.arrays import StringArray
|
43 | 44 |
|
44 | 45 | _cpython_optimized_encoders = (
|
@@ -241,6 +242,117 @@ def g(x):
|
241 | 242 | return lib.map_infer(arr, f)
|
242 | 243 |
|
243 | 244 |
|
def str_format(
    arr,
    format: str,
    name: str = None,
    positional_only: bool = False,
    how_na: str = "any",
) -> "Series":
    """
    Format rows according to the format and return a Series with one string per row.

    Parameters
    ----------
    arr : DataFrame or Series
        The values to format.
    format : str
        Format string, using the ``str.format`` replacement-field syntax.
    name : Label, optional
        The name of the returned Series.
    positional_only : bool, default False
        If True, only allow positional parameters (i.e. allow "{}", but not "{key}").
        Setting to ``True`` will improve performance.
    how_na : str, one of {"all", "any"}, default "any"
        If "all", return ``NA`` if all values in row are nan values.
        If "any", return ``NA`` if at least one of the values in row is a nan value.

    Returns
    -------
    Series
        A Series with dtype ``StringDtype``, formatted according to ``format``.

    Raises
    ------
    ValueError
        If ``how_na`` is neither "any" nor "all".

    Notes
    -----
    Rows that end up as ``NA`` are never passed to the format string, so a
    missing value that does not support the requested format specification
    (e.g. ``None`` with ``"{:>10}"``) does not raise.

    Keyword fields are resolved through the named tuples produced by
    ``DataFrame.itertuples``; column labels that are not valid Python
    identifiers are therefore only reachable positionally or by their
    renamed positional alias (``_1``, ``_2``, ...).

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'state_name': ['California', 'Texas', 'Florida'],
    ...     'state_abbreviation': ['CA', 'TX', 'FL'],
    ...     'population': [39_512_223, 28_995_881, 21_477_737],
    ...     }, index=[1, 2, 3])
    >>> df
       state_name state_abbreviation  population
    1  California                 CA    39512223
    2       Texas                 TX    28995881
    3     Florida                 FL    21477737
    >>> ser = df["population"]

    Formatting using positional arguments:

    >>> ser.format("Population: {:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{} ({}): {:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    Using keyword arguments (only works if column labels are strings):

    >>> ser.format("Population: {population:,}")
    1    Population: 39,512,223
    2    Population: 28,995,881
    3    Population: 21,477,737
    dtype: string

    >>> df.format("{state_name} ({state_abbreviation}): {population:,}")
    1    California (CA): 39,512,223
    2         Texas (TX): 28,995,881
    3       Florida (FL): 21,477,737
    dtype: string

    The index can be added using the keyword 'Index':

    >>> df.format("{state_name} ({state_abbreviation}): {population:,} (no. {Index})")
    1    California (CA): 39,512,223 (no. 1)
    2         Texas (TX): 28,995,881 (no. 2)
    3       Florida (FL): 21,477,737 (no. 3)
    dtype: string
    """
    from pandas import NA
    from pandas.arrays import StringArray

    # Fail fast, before any per-row work, with a message that names the
    # offending parameter and the accepted values.
    if how_na not in ("any", "all"):
        raise ValueError(f"how_na must be one of 'any' or 'all', got {how_na!r}")

    if not isinstance(arr, ABCDataFrame):
        result_wrapper = arr._constructor
        # An unnamed Series gets the label "_1", which is also the alias
        # itertuples assigns to the first non-identifier column.
        arr_name = arr.name if arr.name is not None else "_1"
        arr = arr.to_frame(name=arr_name)
    else:
        result_wrapper = arr._constructor_sliced

    na_frame = isna(arr)
    row_is_na = na_frame.any(axis=1) if how_na == "any" else na_frame.all(axis=1)
    # Plain boolean ndarray: avoids relying on pandas objects when pairing
    # the mask with the rows below.
    row_is_na = np.asarray(row_is_na, dtype=bool)

    func = format.format
    if positional_only:
        # Plain tuples are enough here and cheaper than named tuples.
        rows = arr.itertuples(index=False, name=None)
        formatted = [
            NA if skip else func(*row) for row, skip in zip(rows, row_is_na)
        ]
    else:
        # Named tuples expose the columns (and "Index") as keyword fields;
        # row[1:] drops the index from the positional arguments.
        rows = arr.itertuples()
        formatted = [
            NA if skip else func(*row[1:], **row._asdict())
            for row, skip in zip(rows, row_is_na)
        ]

    result = np.array(formatted, dtype=object)
    return result_wrapper(StringArray(result), index=arr.index.copy(), name=name)
| 354 | + |
| 355 | + |
244 | 356 | def str_count(arr, pat, flags=0):
|
245 | 357 | """
|
246 | 358 | Count occurrences of pattern in each string of the Series/Index.
|
|
0 commit comments