Skip to content

Commit 25bf583

Browse files
committed
ENH: Added keep_whitespace and whitespace_chars to read_fwf, allowing
more control over handling of whitespace in fields and removing the requirement to specify a `delimiter` in order to preserve whitespace. (pandas-dev#51569) Signed-off-by: Ronald Barnes <[email protected]>
1 parent c4caed6 commit 25bf583

File tree

2 files changed

+74
-3
lines changed

2 files changed

+74
-3
lines changed

pandas/io/parsers/python_parser.py

+49-1
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
104104
self.decimal = kwds["decimal"]
105105

106106
self.comment = kwds["comment"]
107+
## GH51569
108+
self.keep_whitespace = kwds.get("keep_whitespace")
109+
self.whitespace_chars = kwds.get("whitespace_chars")
107110

108111
# Set self.data to something that can read lines.
109112
if isinstance(f, list):
@@ -1180,11 +1183,20 @@ def __init__(
11801183
comment: str | None,
11811184
skiprows: set[int] | None = None,
11821185
infer_nrows: int = 100,
1186+
## GH51569
1187+
keep_whitespace: bool | tuple[bool, bool] = (False, False),
1188+
whitespace_chars: str = " \t",
11831189
) -> None:
11841190
self.f = f
11851191
self.buffer: Iterator | None = None
11861192
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
11871193
self.comment = comment
1194+
self.keep_whitespace = keep_whitespace
1195+
## Backwards compatibility means supporting delimiter:
1196+
if delimiter:
1197+
whitespace_chars = whitespace_chars + delimiter
1198+
self.whitespace_chars = whitespace_chars
1199+
11881200
if colspecs == "infer":
11891201
self.colspecs = self.detect_colspecs(
11901202
infer_nrows=infer_nrows, skiprows=skiprows
@@ -1210,6 +1222,33 @@ def __init__(
12101222
"2 element tuple or list of integers"
12111223
)
12121224

1225+
## GH51569
1226+
## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields:
1227+
if isinstance(self.keep_whitespace, bool):
1228+
self.keep_whitespace = (keep_whitespace, keep_whitespace)
1229+
## Ensure tuple is (bool,bool):
1230+
if (
1231+
isinstance(self.keep_whitespace, tuple)
1232+
and len(self.keep_whitespace) == 2
1233+
and isinstance(self.keep_whitespace[0], bool)
1234+
and isinstance(self.keep_whitespace[1], bool)
1235+
):
1236+
# Define custom lstrip & rstrip *once*, at __init__:
1237+
if self.keep_whitespace[0] is True:
1238+
self.ltrim = lambda x: x
1239+
else:
1240+
self.ltrim = lambda x: x.lstrip(self.whitespace_chars)
1241+
if self.keep_whitespace[1] is True:
1242+
self.rtrim = lambda x: x
1243+
else:
1244+
self.rtrim = lambda x: x.rstrip(self.whitespace_chars)
1245+
else:
1246+
raise ValueError(
1247+
"'keep_whitespace' must be a bool or tuple(bool,bool)."
1248+
f"\nReceived '{type(self.keep_whitespace).__name__}': "
1249+
f"'{self.keep_whitespace}'."
1250+
)
1251+
12131252
def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
12141253
"""
12151254
Read rows from self.f, skipping as specified.
@@ -1281,8 +1320,14 @@ def __next__(self) -> list[str]:
12811320
line = next(self.f) # type: ignore[arg-type]
12821321
else:
12831322
line = next(self.f) # type: ignore[arg-type]
1323+
1324+
line = line.rstrip("\r\n")
1325+
12841326
# Note: 'colspecs' is a sequence of half-open intervals.
1285-
return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
1327+
return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs]
1328+
1329+
1330+
# return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
12861331

12871332

12881333
class FixedWidthFieldParser(PythonParser):
@@ -1305,6 +1350,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
13051350
self.comment,
13061351
self.skiprows,
13071352
self.infer_nrows,
1353+
## GH51569
1354+
self.keep_whitespace,
1355+
self.whitespace_chars,
13081356
)
13091357

13101358
def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:

pandas/io/parsers/readers.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,13 @@
435435
"float_precision": None,
436436
}
437437

438-
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
438+
_fwf_defaults = {
439+
"colspecs": "infer",
440+
"infer_nrows": 100,
441+
"widths": None,
442+
"keep_whitespace": (False, False),
443+
"whitespace_chars": " \t",
444+
}
439445

440446
_c_unsupported = {"skipfooter"}
441447
_python_unsupported = {"low_memory", "float_precision"}
@@ -1235,10 +1241,13 @@ def read_fwf(
12351241
widths: Sequence[int] | None = None,
12361242
infer_nrows: int = 100,
12371243
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
1244+
## GH51569
1245+
keep_whitespace: bool | tuple[bool, bool] = (False, False),
1246+
whitespace_chars: str = " \t",
12381247
**kwds,
12391248
) -> DataFrame | TextFileReader:
12401249
r"""
1241-
Read a table of fixed-width formatted lines into DataFrame.
1250+
Read a file of fixed-width lines into DataFrame.
12421251
12431252
Also supports optionally iterating or breaking of the file
12441253
into chunks.
@@ -1266,6 +1275,8 @@ def read_fwf(
12661275
infer_nrows : int, default 100
12671276
The number of rows to consider when letting the parser determine the
12681277
`colspecs`.
1278+
delimiter : str, default ``' '`` and ``'\t'`` characters
1279+
When inferring colspecs, sets the column / field separator.
12691280
use_nullable_dtypes : bool = False
12701281
Whether or not to use nullable dtypes as default when reading data. If
12711282
set to True, nullable dtypes are used for all dtypes that have a nullable
@@ -1283,6 +1294,14 @@ def read_fwf(
12831294
12841295
.. versionadded:: 2.0
12851296
1297+
keep_whitespace : bool or tuple of (bool, bool), default (False, False)
1298+
Whether to keep whitespace at the start and end of each field / column. A single bool applies to both sides; a tuple is ``(keep_left, keep_right)``.
1299+
whitespace_chars : str, default ``' '`` and ``'\t'`` characters
1300+
The characters stripped from each side of a field / column for which
1301+
``keep_whitespace`` is False. Any ``delimiter`` characters given are stripped as well.
1302+
1303+
.. versionadded:: 2.0
1304+
12861305
**kwds : optional
12871306
Optional keyword arguments can be passed to ``TextFileReader``.
12881307
@@ -1294,6 +1313,7 @@ def read_fwf(
12941313
12951314
See Also
12961315
--------
1316+
read_table : Read a general delimited file into DataFrame.
12971317
DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
12981318
read_csv : Read a comma-separated values (csv) file into DataFrame.
12991319
@@ -1346,6 +1366,9 @@ def read_fwf(
13461366
kwds["infer_nrows"] = infer_nrows
13471367
kwds["engine"] = "python-fwf"
13481368
kwds["use_nullable_dtypes"] = use_nullable_dtypes
1369+
## GH51569
1370+
kwds["keep_whitespace"] = keep_whitespace
1371+
kwds["whitespace_chars"] = whitespace_chars
13491372
return _read(filepath_or_buffer, kwds)
13501373

13511374

0 commit comments

Comments
 (0)