From 2bfa90a35c18ca6a6a71f78a920863c50f78477e Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Mon, 21 Nov 2022 21:30:07 -0800 Subject: [PATCH 1/5] Updated documentation indicating default behaviour is to strip whitespace, and how to override. Enhances GH-issue-16950 https://github.com/pandas-dev/pandas/pull/16950 --- doc/source/user_guide/io.rst | 9 ++++++--- pandas/io/parsers/readers.py | 6 +++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b4bf3ef024d4c..f9637d13da0be 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1366,8 +1366,10 @@ a different usage of the ``delimiter`` parameter: * ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. * ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Can be used to specify the filler character of the fields - if it is not spaces (e.g., '~'). + Default is "`` \t``" (space and tab). + Used to specify the character(s) to strip from start and end of every field. + To preserve whitespace, set to a character that does not exist in the data, + i.e. "\0". Consider a typical fixed-width data file: @@ -1404,8 +1406,9 @@ column widths for contiguous columns: df = pd.read_fwf("bar.csv", widths=widths, header=None) df -The parser will take care of extra white spaces around the columns +The parser will take care of extra whitespace around the columns, so it's ok to have extra separation between the columns in the file. +To preserve whitespace around the columns, see ``delimiter``. By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the first 100 rows of the file. 
It can do it only in cases when the columns are diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 700a2b6ba964c..ac909ebc21960 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1231,6 +1231,7 @@ def read_fwf( *, colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, + delimiter: str | None = " \t", infer_nrows: int = 100, **kwds, ) -> DataFrame | TextFileReader: @@ -1251,7 +1252,7 @@ def read_fwf( Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.csv``. - colspecs : list of tuple (int, int) or 'infer'. optional + colspecs : list of tuple (int, int) or 'infer', optional A list of tuples giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try @@ -1260,6 +1261,9 @@ def read_fwf( widths : list of int, optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + delimiter : str, default " \t" (space and tab), optional + Character(s) to strip from start and end of each field. To + preserve whitespace, must be non-default value (i.e. delimiter="\0"). infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. From f297d9904565231f59753883c2ecd267a1f133e5 Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Mon, 21 Nov 2022 21:52:33 -0800 Subject: [PATCH 2/5] Fix failed Sphinx lint issue. 
--- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f9637d13da0be..a8b58bebbc4b5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1366,7 +1366,7 @@ a different usage of the ``delimiter`` parameter: * ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. * ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Default is "`` \t``" (space and tab). + Defaults are space and tab characters. Used to specify the character(s) to strip from start and end of every field. To preserve whitespace, set to a character that does not exist in the data, i.e. "\0". From a0304a7c0504c4d4b1cdcfefa3bbd528e6fefc5c Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Mon, 21 Nov 2022 22:09:48 -0800 Subject: [PATCH 3/5] Added delimiter to _fwf_defaults. --- pandas/io/parsers/readers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ac909ebc21960..0a6175cc65c1e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -435,7 +435,12 @@ "float_precision": None, } -_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_fwf_defaults = { + "colspecs": "infer", + "infer_nrows": 100, + "widths": None, + "delimiter": " ", +} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} From 7adb89d405ed7803952ff0a2452a99c37115c0ea Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Mon, 21 Nov 2022 23:22:11 -0800 Subject: [PATCH 4/5] Changed comment from ## to # per flake8. 
--- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 0a6175cc65c1e..b9e2faf6e57b7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -439,7 +439,7 @@ "colspecs": "infer", "infer_nrows": 100, "widths": None, - "delimiter": " ", + "delimiter": " ", # space & [TAB] } _c_unsupported = {"skipfooter"} From ab111c7549948efeba7f9a3042dac1cfcf6284be Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Wed, 23 Nov 2022 00:28:05 -0800 Subject: [PATCH 5/5] Delimiters used by colspecs='infer' --- pandas/io/parsers/readers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6a2f9c9baf24b..defe396813d5a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1274,6 +1274,7 @@ def read_fwf( delimiter : str, default " \t" (space and tab), optional Character(s) to strip from start and end of each field. To preserve whitespace, must be non-default value (i.e. delimiter="\0"). + Used by `colspecs="infer"` to determine column boundaries. infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`.