ENH: Added keep_whitespace and whitespace_chars to read_fwf, allowing

RonaldBarnes · RonaldBarnes · commit 25bf5838452e · 2023-02-23T00:55:53.000-08:00
more control over handling of whitespace in fields and removing the requirement to specify a `delimiter` in order to preserve whitespace. (pandas-dev#51569) Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -104,6 +104,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
         self.decimal = kwds["decimal"]
 
         self.comment = kwds["comment"]
+        ## GH51569
+        self.keep_whitespace = kwds.get("keep_whitespace")
+        self.whitespace_chars = kwds.get("whitespace_chars")
 
         # Set self.data to something that can read lines.
         if isinstance(f, list):
@@ -1180,11 +1183,20 @@ def __init__(
         comment: str | None,
         skiprows: set[int] | None = None,
         infer_nrows: int = 100,
+        ## GH51569
+        keep_whitespace: bool | tuple[bool, bool] = (False, False),
+        whitespace_chars: str = " \t",
     ) -> None:
         self.f = f
         self.buffer: Iterator | None = None
         self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
         self.comment = comment
+        self.keep_whitespace = keep_whitespace
+        ## Backwards compatibility: `delimiter` characters are stripped from fields too:
+        if delimiter:
+            whitespace_chars = whitespace_chars + delimiter
+        self.whitespace_chars = whitespace_chars
+
         if colspecs == "infer":
             self.colspecs = self.detect_colspecs(
                 infer_nrows=infer_nrows, skiprows=skiprows
@@ -1210,6 +1222,33 @@ def __init__(
                     "2 element tuple or list of integers"
                 )
 
+        ## GH51569
+        ## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields:
+        if isinstance(self.keep_whitespace, bool):
+            self.keep_whitespace = (keep_whitespace, keep_whitespace)
+        ## Ensure tuple is (bool,bool):
+        if (
+            isinstance(self.keep_whitespace, tuple)
+            and len(self.keep_whitespace) == 2
+            and isinstance(self.keep_whitespace[0], bool)
+            and isinstance(self.keep_whitespace[1], bool)
+        ):
+            # Define custom lstrip & rstrip *once*, at __init__:
+            if self.keep_whitespace[0] is True:
+                self.ltrim = lambda x: x
+            else:
+                self.ltrim = lambda x: x.lstrip(self.whitespace_chars)
+            if self.keep_whitespace[1] is True:
+                self.rtrim = lambda x: x
+            else:
+                self.rtrim = lambda x: x.rstrip(self.whitespace_chars)
+        else:
+            raise ValueError(
+                "'keep_whitespace' must be a bool or tuple(bool,bool)."
+                f"\nReceived '{type(self.keep_whitespace).__name__}': "
+                f"'{self.keep_whitespace}'."
+            )
+
     def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
         """
         Read rows from self.f, skipping as specified.
@@ -1281,8 +1320,14 @@ def __next__(self) -> list[str]:
                 line = next(self.f)  # type: ignore[arg-type]
         else:
             line = next(self.f)  # type: ignore[arg-type]
+
+        line = line.rstrip("\r\n")
+
         # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
+        return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs]
+
+
+#        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
 
 
 class FixedWidthFieldParser(PythonParser):
@@ -1305,6 +1350,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
             self.comment,
             self.skiprows,
             self.infer_nrows,
+            ## GH51569
+            self.keep_whitespace,
+            self.whitespace_chars,
         )
 
     def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -435,7 +435,13 @@
     "float_precision": None,
 }
 
-_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
+_fwf_defaults = {
+    "colspecs": "infer",
+    "infer_nrows": 100,
+    "widths": None,
+    "keep_whitespace": (False, False),
+    "whitespace_chars": " \t",
+}
 
 _c_unsupported = {"skipfooter"}
 _python_unsupported = {"low_memory", "float_precision"}
@@ -1235,10 +1241,13 @@ def read_fwf(
     widths: Sequence[int] | None = None,
     infer_nrows: int = 100,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    ## GH51569
+    keep_whitespace: bool | tuple[bool, bool] = (False, False),
+    whitespace_chars: str = " \t",
     **kwds,
 ) -> DataFrame | TextFileReader:
     r"""
-    Read a table of fixed-width formatted lines into DataFrame.
+    Read a file of fixed-width lines into DataFrame.
 
     Also supports optionally iterating or breaking of the file
     into chunks.
@@ -1266,6 +1275,8 @@ def read_fwf(
     infer_nrows : int, default 100
         The number of rows to consider when letting the parser determine the
         `colspecs`.
+    delimiter : str, default ``' '`` and ``'\t'`` characters
+        Field separator(s) used to infer colspecs; also added to ``whitespace_chars``.
     use_nullable_dtypes : bool = False
         Whether or not to use nullable dtypes as default when reading data. If
         set to True, nullable dtypes are used for all dtypes that have a nullable
@@ -1283,6 +1294,14 @@ def read_fwf(
 
         .. versionadded:: 2.0
 
+    keep_whitespace : bool or tuple of (bool, bool), default (False, False)
+        Whether to keep whitespace at the (start, end) of each field; a bool sets both.
+    whitespace_chars : str, default ``' '`` and ``'\t'`` characters
+        Characters stripped from each side of a field / column that
+        ``keep_whitespace`` does not preserve.
+
+        .. versionadded:: 2.0
+
     **kwds : optional
         Optional keyword arguments can be passed to ``TextFileReader``.
 
@@ -1294,6 +1313,7 @@ def read_fwf(
 
     See Also
     --------
+    read_table : Read general delimited file into DataFrame.
     DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
     read_csv : Read a comma-separated values (csv) file into DataFrame.
 
@@ -1346,6 +1366,9 @@ def read_fwf(
     kwds["infer_nrows"] = infer_nrows
     kwds["engine"] = "python-fwf"
     kwds["use_nullable_dtypes"] = use_nullable_dtypes
+    ## GH51569
+    kwds["keep_whitespace"] = keep_whitespace
+    kwds["whitespace_chars"] = whitespace_chars
     return _read(filepath_or_buffer, kwds)