Skip to content

Commit 5d80fd0

Browse files
committed
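Preserve whitespace by default in read_fwf() by adding two parameters: 'keep_whitespace' (default True) and 'whitespace_chars' (default: the space and tab characters), which lists the characters stripped from each field when 'keep_whitespace' is False. Passing 'delimiter' now emits a deprecation warning.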
See pandas-dev#49832 (comment) Signed-off-by: Ronald Barnes <[email protected]>
1 parent 859e4eb commit 5d80fd0

File tree

2 files changed

+65
-3
lines changed

2 files changed

+65
-3
lines changed

pandas/io/parsers/python_parser.py

+46-2
Original file line numberDiff line numberDiff line change
@@ -1181,10 +1181,13 @@ def __init__(
11811181
comment: str | None,
11821182
skiprows: set[int] | None = None,
11831183
infer_nrows: int = 100,
1184+
keep_whitespace: bool | None = True,
1185+
whitespace_chars: str | None = " \t",
11841186
) -> None:
11851187
self.f = f
11861188
self.buffer: Iterator | None = None
1187-
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
1189+
self.keep_whitespace = keep_whitespace
1190+
self.whitespace_chars = whitespace_chars
11881191
self.comment = comment
11891192
if colspecs == "infer":
11901193
self.colspecs = self.detect_colspecs(
@@ -1211,6 +1214,36 @@ def __init__(
12111214
"2 element tuple or list of integers"
12121215
)
12131216

1217+
if not isinstance(self.keep_whitespace, bool):
1218+
raise TypeError(
1219+
"keep_whitespace must be type bool (True or False), "
1220+
f"input was type {type(self.keep_whitespace).__name__}: "
1221+
f'"{self.keep_whitespace}"'
1222+
)
1223+
if delimiter:
1224+
## Delimiters in fixed-width files removed:
1225+
## use colspecs, widths, or read_table()
1226+
import warnings
1227+
1228+
## See link regarding fixing anti-patterns & unexpected default behaviour:
1229+
## https://github.com/pandas-dev/pandas/pull/49832#discussion_r1030615937
1230+
##
1231+
## Deprecation warnings ignored by default, show them:
1232+
warnings.simplefilter("always")
1233+
warnings.formatwarning = (
1234+
lambda msg, cat, file, line, args1: f"NOTICE:\n{msg}\n\n"
1235+
f'{cat}\nFile "{file}", line {line} '
1236+
"in FixedWidthReader.__init__\n"
1237+
)
1238+
warnings.warn(
1239+
(
1240+
"Delimiters are deprecated in fixed-width files "
1241+
+ "- use colspecs or widths\n"
1242+
+ "See keep_whitespace in read_fwf(), also see read_table()."
1243+
),
1244+
DeprecationWarning,
1245+
)
1246+
12141247
def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
12151248
"""
12161249
Read rows from self.f, skipping as specified.
@@ -1283,7 +1316,14 @@ def __next__(self) -> list[str]:
12831316
else:
12841317
line = next(self.f) # type: ignore[arg-type]
12851318
# Note: 'colspecs' is a sequence of half-open intervals.
1286-
return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
1319+
line = line.rstrip("\r\n")
1320+
if self.keep_whitespace:
1321+
return [line[from_:to] for (from_, to) in self.colspecs]
1322+
else:
1323+
return [
1324+
line[from_:to].strip(self.whitespace_chars)
1325+
for (from_, to) in self.colspecs
1326+
]
12871327

12881328

12891329
class FixedWidthFieldParser(PythonParser):
@@ -1296,6 +1336,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
12961336
# Support iterators, convert to a list.
12971337
self.colspecs = kwds.pop("colspecs")
12981338
self.infer_nrows = kwds.pop("infer_nrows")
1339+
self.keep_whitespace = kwds.pop("keep_whitespace", True)
1340+
self.whitespace_chars = kwds.pop("whitespace_chars", " \t")
12991341
PythonParser.__init__(self, f, **kwds)
13001342

13011343
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
@@ -1306,6 +1348,8 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
13061348
self.comment,
13071349
self.skiprows,
13081350
self.infer_nrows,
1351+
self.keep_whitespace,
1352+
self.whitespace_chars,
13091353
)
13101354

13111355
def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:

pandas/io/parsers/readers.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,13 @@
438438
"float_precision": None,
439439
}
440440

441-
_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
441+
_fwf_defaults = {
442+
"colspecs": "infer",
443+
"infer_nrows": 100,
444+
"widths": None,
445+
"keep_whitespace": True,
446+
"whitespace_chars": " \t",
447+
}
442448

443449
_c_unsupported = {"skipfooter"}
444450
_python_unsupported = {"low_memory", "float_precision"}
@@ -1235,6 +1241,8 @@ def read_fwf(
12351241
widths: Sequence[int] | None = None,
12361242
infer_nrows: int = 100,
12371243
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
1244+
keep_whitespace: bool | None = True,
1245+
whitespace_chars: str | None = " \t",
12381246
**kwds,
12391247
) -> DataFrame | TextFileReader:
12401248
r"""
@@ -1273,6 +1281,14 @@ def read_fwf(
12731281
12741282
.. versionadded:: 2.0
12751283
1284+
keep_whitespace : bool, default True
1285+
Preserve or strip whitespace from fields.
1286+
whitespace_chars : str, default [space] & [tab]
1287+
If stripping whitespace, allows user to specify which
1288+
characters to strip (can be any characters).
1289+
1290+
.. versionadded:: 2.0
1291+
12761292
**kwds : optional
12771293
Optional keyword arguments can be passed to ``TextFileReader``.
12781294
@@ -1336,6 +1352,8 @@ def read_fwf(
13361352
kwds["infer_nrows"] = infer_nrows
13371353
kwds["engine"] = "python-fwf"
13381354
kwds["use_nullable_dtypes"] = use_nullable_dtypes
1355+
kwds["keep_whitespace"] = keep_whitespace
1356+
kwds["whitespace_chars"] = whitespace_chars
13391357
return _read(filepath_or_buffer, kwds)
13401358

13411359

0 commit comments

Comments
 (0)