From 25bf5838452e23943a89a80e7fa1c03e50040928 Mon Sep 17 00:00:00 2001
From: Ronald Barnes <ron@ronaldbarnes.ca>
Date: Thu, 23 Feb 2023 00:55:53 -0800
Subject: [PATCH 1/4] ENH: Added `keep_whitespace` and `whitespace_chars` to
 `read_fwf`, allowing more control over handling of whitespace in fields and
 removing the requirement to specify a `delimiter` in order to preserve
 whitespace. (#51569)

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
---
 pandas/io/parsers/python_parser.py | 50 +++++++++++++++++++++++++++++-
 pandas/io/parsers/readers.py       | 27 ++++++++++++++--
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 62a4e80147780..e1679bbdcf6b2 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -104,6 +104,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
         self.decimal = kwds["decimal"]
 
         self.comment = kwds["comment"]
+        ## GH51569
+        self.keep_whitespace = kwds.get("keep_whitespace")
+        self.whitespace_chars = kwds.get("whitespace_chars")
 
         # Set self.data to something that can read lines.
         if isinstance(f, list):
@@ -1180,11 +1183,20 @@ def __init__(
         comment: str | None,
         skiprows: set[int] | None = None,
         infer_nrows: int = 100,
+        ## GH51569
+        keep_whitespace: bool | tuple[bool, bool] = (False, False),
+        whitespace_chars: str = " \t",
     ) -> None:
         self.f = f
         self.buffer: Iterator | None = None
         self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
         self.comment = comment
+        self.keep_whitespace = keep_whitespace
+        ## Backwards compatibility means supporting delimiter:
+        if delimiter:
+            whitespace_chars = whitespace_chars + delimiter
+        self.whitespace_chars = whitespace_chars
+
         if colspecs == "infer":
             self.colspecs = self.detect_colspecs(
                 infer_nrows=infer_nrows, skiprows=skiprows
@@ -1210,6 +1222,33 @@ def __init__(
                     "2 element tuple or list of integers"
                 )
 
+        ## GH51569
+        ## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields:
+        if isinstance(self.keep_whitespace, bool):
+            self.keep_whitespace = (keep_whitespace, keep_whitespace)
+        ## Ensure tuple is (bool,bool):
+        if (
+            isinstance(self.keep_whitespace, tuple)
+            and len(self.keep_whitespace) == 2
+            and isinstance(self.keep_whitespace[0], bool)
+            and isinstance(self.keep_whitespace[1], bool)
+        ):
+            # Define custom lstrip & rstrip *once*, at __init__:
+            if self.keep_whitespace[0] is True:
+                self.ltrim = lambda x: x
+            else:
+                self.ltrim = lambda x: x.lstrip(self.whitespace_chars)
+            if self.keep_whitespace[1] is True:
+                self.rtrim = lambda x: x
+            else:
+                self.rtrim = lambda x: x.rstrip(self.whitespace_chars)
+        else:
+            raise ValueError(
+                "'keep_whitespace' must be a bool or tuple(bool,bool)."
+                f"\nReceived '{type(self.keep_whitespace).__name__}': "
+                f"'{self.keep_whitespace}'."
+            )
+
     def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
         """
         Read rows from self.f, skipping as specified.
@@ -1281,8 +1320,14 @@ def __next__(self) -> list[str]:
                 line = next(self.f)  # type: ignore[arg-type]
         else:
             line = next(self.f)  # type: ignore[arg-type]
+
+        line = line.rstrip("\r\n")
+
         # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
+        return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs]
+
+
+#        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
 
 
 class FixedWidthFieldParser(PythonParser):
@@ -1305,6 +1350,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
             self.comment,
             self.skiprows,
             self.infer_nrows,
+            ## GH51569
+            self.keep_whitespace,
+            self.whitespace_chars,
         )
 
     def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 7230c675ee775..081845dbe3357 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -435,7 +435,13 @@
     "float_precision": None,
 }
 
-_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
+_fwf_defaults = {
+    "colspecs": "infer",
+    "infer_nrows": 100,
+    "widths": None,
+    "keep_whitespace": (False, False),
+    "whitespace_chars": " \t",
+}
 
 _c_unsupported = {"skipfooter"}
 _python_unsupported = {"low_memory", "float_precision"}
@@ -1235,10 +1241,13 @@ def read_fwf(
     widths: Sequence[int] | None = None,
     infer_nrows: int = 100,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    ## GH51569
+    keep_whitespace: bool | tuple[bool, bool] = (False, False),
+    whitespace_chars: str = " \t",
     **kwds,
 ) -> DataFrame | TextFileReader:
     r"""
-    Read a table of fixed-width formatted lines into DataFrame.
+    Read a file of fixed-width lines into DataFrame.
 
     Also supports optionally iterating or breaking of the file
     into chunks.
@@ -1266,6 +1275,8 @@ def read_fwf(
     infer_nrows : int, default 100
         The number of rows to consider when letting the parser determine the
         `colspecs`.
+    delimiter : str, default ``' '`` and ``'\t'`` characters
+        When inferring colspecs, sets the column / field separator.
     use_nullable_dtypes : bool = False
         Whether or not to use nullable dtypes as default when reading data. If
         set to True, nullable dtypes are used for all dtypes that have a nullable
@@ -1283,6 +1294,14 @@ def read_fwf(
 
         .. versionadded:: 2.0
 
+    keep_whitespace : bool or tuple (bool, bool), default (False, False)
+        Whether to keep whitespace at the (start, end) of each field / column.
+    whitespace_chars : str, default ``' '`` and ``'\t'`` characters
+        Characters stripped from whichever side(s) of each field / column
+        ``keep_whitespace`` marks as False.
+
+        .. versionadded:: 2.0
+
     **kwds : optional
         Optional keyword arguments can be passed to ``TextFileReader``.
 
@@ -1294,6 +1313,7 @@ def read_fwf(
 
     See Also
     --------
+    read_table : Read data from table (i.e. columns with delimiting spaces).
     DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
     read_csv : Read a comma-separated values (csv) file into DataFrame.
 
@@ -1346,6 +1366,9 @@ def read_fwf(
     kwds["infer_nrows"] = infer_nrows
     kwds["engine"] = "python-fwf"
     kwds["use_nullable_dtypes"] = use_nullable_dtypes
+    ## GH51569
+    kwds["keep_whitespace"] = keep_whitespace
+    kwds["whitespace_chars"] = whitespace_chars
     return _read(filepath_or_buffer, kwds)
 
 

From 40ad7db326d04957a2f2c68d21bfa3d376813443 Mon Sep 17 00:00:00 2001
From: Ronald Barnes <ron@ronaldbarnes.ca>
Date: Thu, 23 Feb 2023 01:00:40 -0800
Subject: [PATCH 2/4] DOC: Document behaviour for `keep_whitespace` and
 `whitespace_chars` options to `read_fwf`. (#51569)

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
---
 doc/source/user_guide/io.rst | 88 ++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index ec082cb90e75c..259a6db0778e6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1006,7 +1006,7 @@ first read it in as an object dtype and then apply :func:`to_datetime` to each e
 
 .. ipython:: python
 
-   data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n")
+   data = StringIO("date\n12 Jan 2000\n2000-01-13\n")
    df = pd.read_csv(data)
    df['date'] = df['date'].apply(pd.to_datetime)
    df
@@ -1373,8 +1373,7 @@ Files with fixed width columns
 
 While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works
 with data files that have known and fixed column widths. The function parameters
-to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and
-a different usage of the ``delimiter`` parameter:
+to ``read_fwf`` are largely the same as ``read_csv`` with five extra parameters:
 
 * ``colspecs``: A list of pairs (tuples) giving the extents of the
   fixed-width fields of each line as half-open intervals (i.e.,  [from, to[ ).
@@ -1383,12 +1382,46 @@ a different usage of the ``delimiter`` parameter:
   behavior, if not specified, is to infer.
 * ``widths``: A list of field widths which can be used instead of 'colspecs'
   if the intervals are contiguous.
-* ``delimiter``: Characters to consider as filler characters in the fixed-width file.
-  Can be used to specify the filler character of the fields
-  if it is not spaces (e.g., '~').
+* ``keep_whitespace``: A boolean or a tuple(bool,bool) indicating how whitespace
+  at the (start,end) of each field / column should be handled.
+* ``whitespace_chars``: A string of characters to strip from the start and/or end
+  of fields / columns when 'keep_whitespace' contains a False value.
+* ``delimiter``: Character(s) separating columns when inferring 'colspecs'.
 
 Consider a typical fixed-width data file:
 
+.. ipython:: python
+
+   data = (
+      "name1     VANBCCAN 107.51  46  B  8  E  \n"
+      "name2     BBYBCCAN* 20.00  5 1  5 7  F E\n"
+      "fullname 3VICBCCAN  22.50  3    1  C   5\n"
+   )
+   df = pd.read_fwf(StringIO(data),
+      header=None,
+      widths=[10,3,2,3,1,6,3,12],
+      keep_whitespace=(True,False),
+      names=["Name", "City", "Prov", "Country", "Deleted",
+          "TransAvg", "TransCount", "CreditScores"],
+      # Do not convert field data to NaN:
+      na_filter=False,
+   )
+   df
+   df.values
+
+Note that the name field had trailing whitespace removed, as
+did the other text fields. However, the *leading* whitespace in CreditScores was
+preserved.
+
+This is due to the ``keep_whitespace`` setting of ``(True, False)``, representing
+(start, end), and the ``whitespace_chars`` default of ``' '`` and ``'\t'`` (space and tab).
+
+The TransAvg and TransCount fields had automatic dtype conversion to
+float64 and int64 respectively.
+
+
+Parsing a table is possible (see also ``read_table``):
+
 .. ipython:: python
 
    data1 = (
@@ -1398,41 +1431,40 @@ Consider a typical fixed-width data file:
        "id1230    413.836124   184.375703   11916.8\n"
        "id1948    502.953953   173.237159   12468.3"
    )
-   with open("bar.csv", "w") as f:
-       f.write(data1)
 
-In order to parse this file into a ``DataFrame``, we simply need to supply the
-column specifications to the ``read_fwf`` function along with the file name:
+In order to parse this data set into a ``DataFrame``, we simply need to supply the
+column specifications to the ``read_fwf`` function:
 
 .. ipython:: python
 
    # Column specifications are a list of half-intervals
    colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
-   df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0)
+   df = pd.read_fwf(StringIO(data1),
+      colspecs=colspecs,
+      header=None,
+      index_col=0
+   )
    df
 
 Note how the parser automatically picks column names X.<column number> when
-``header=None`` argument is specified. Alternatively, you can supply just the
-column widths for contiguous columns:
-
-.. ipython:: python
-
-   # Widths are a list of integers
-   widths = [6, 14, 13, 10]
-   df = pd.read_fwf("bar.csv", widths=widths, header=None)
-   df
+``header=None`` argument is specified.
 
-The parser will take care of extra white spaces around the columns
-so it's ok to have extra separation between the columns in the file.
+The parser will take care of extra white spaces around the numeric data columns, and
+trailing spaces on string data, so it's ok to have extra separation between the columns
+in the file.
 
 By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the
 first 100 rows of the file. It can do it only in cases when the columns are
 aligned and correctly separated by the provided ``delimiter`` (default delimiter
 is whitespace).
 
+
 .. ipython:: python
 
-   df = pd.read_fwf("bar.csv", header=None, index_col=0)
+   df = pd.read_fwf(StringIO(data1),
+      header=None,
+      index_col=0
+   )
    df
 
 ``read_fwf`` supports the ``dtype`` parameter for specifying the types of
@@ -1440,10 +1472,16 @@ parsed columns to be different from the inferred type.
 
 .. ipython:: python
 
-   pd.read_fwf("bar.csv", header=None, index_col=0).dtypes
-   pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes
+   pd.read_fwf(StringIO(data1),
+      header=None,
+      index_col=0).dtypes
+
+   pd.read_fwf(StringIO(data1),
+      header=None,
+      dtype={2: "object"}).dtypes
 
 .. ipython:: python
+   :okexcept:
    :suppress:
 
    os.remove("bar.csv")

From 00066375194135ab6012a4b961597116e0fe610b Mon Sep 17 00:00:00 2001
From: Ronald Barnes <ron@ronaldbarnes.ca>
Date: Thu, 23 Feb 2023 01:01:47 -0800
Subject: [PATCH 3/4] TST: Integration tests for `keep_whitespace` option of
 `read_fwf`. (#51569)

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
---
 pandas/tests/io/parser/test_read_fwf.py | 52 ++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index c2939f7c12f10..1b33648194b5b 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -633,8 +633,15 @@ def test_whitespace_preservation():
     fwf_data = """
  a bbb
  ccdd """
+    # GH51569: whitespace was previously preserved by passing a non-space
+    # delimiter; ``keep_whitespace=True`` now expresses that intent directly.
     result = read_fwf(
-        StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t"
+        StringIO(fwf_data),
+        widths=[3, 3],
+        header=header,
+        skiprows=[0],
+        # delimiter="\n\t",
+        keep_whitespace=True,
     )
     expected = read_csv(StringIO(csv_data), header=header)
     tm.assert_frame_equal(result, expected)
@@ -1004,3 +1011,46 @@ def test_use_nullable_dtypes_option():
 
     expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")})
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "keep_whitespace, data, expected",
+    [
+        (
+            # Preserve all whitespace:
+            True,
+            # 10-byte wide fields:
+            ["left      ", "  centre  ", "    right "],
+            DataFrame(["left      ", "  centre  ", "    right "]),
+        ),
+        (
+            # Preserve no whitespace:
+            False,
+            # 10-byte wide fields:
+            ["left      ", "  centre  ", "    right "],
+            DataFrame(["left", "centre", "right"]),
+        ),
+        # Preserve leading whitespace only:
+        (
+            (True, False),
+            ["left      ", "  centre  ", "    right"],
+            DataFrame(["left", "  centre", "    right"]),
+        ),
+        # Preserve trailing whitespace only:
+        (
+            (False, True),
+            ["left      ", "  centre  ", "    right"],
+            DataFrame(["left      ", "centre  ", "right"]),
+        ),
+    ],
+)
+def test_fwf_keep_whitespace_true(keep_whitespace, data, expected):
+    # see GH#51569
+
+    result = read_fwf(
+        StringIO("\n".join(data)),
+        header=None,
+        widths=[10],
+        keep_whitespace=keep_whitespace,
+    )
+    tm.assert_frame_equal(result, expected)

From 65d444dc07a164a3aa4256ecb892d431d4774408 Mon Sep 17 00:00:00 2001
From: Ronald Barnes <ron@ronaldbarnes.ca>
Date: Thu, 23 Feb 2023 01:17:34 -0800
Subject: [PATCH 4/4] DOC: What's New for v2.0.0: `keep_whitespace` and
 `whitespace_chars` arguments for `read_fwf`. (#51569)

Signed-off-by: Ronald Barnes <ron@ronaldbarnes.ca>
---
 doc/source/whatsnew/v2.0.0.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 29f360e050548..ebf89a201a759 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -312,6 +312,7 @@ Other enhancements
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
 - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
+- Added new arguments ``keep_whitespace`` and ``whitespace_chars`` to :func:`read_fwf`, giving finer and more intuitive control over whitespace handling (:issue:`51569`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -829,8 +830,8 @@ Deprecations
 - Deprecated :meth:`Series.backfill` in favor of :meth:`Series.bfill` (:issue:`33396`)
 - Deprecated :meth:`DataFrame.pad` in favor of :meth:`DataFrame.ffill` (:issue:`33396`)
 - Deprecated :meth:`DataFrame.backfill` in favor of :meth:`DataFrame.bfill` (:issue:`33396`)
+- Deprecated using the ``delimiter`` option of :func:`read_fwf` to preserve whitespace, in favor of ``keep_whitespace`` and ``whitespace_chars`` (:issue:`51569`)
 -
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.prior_deprecations: