From 25bf5838452e23943a89a80e7fa1c03e50040928 Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Thu, 23 Feb 2023 00:55:53 -0800 Subject: [PATCH 1/4] ENH: Added `keep_whitespace` and `whitespace_chars` to `read_fwf`, allowing more control over handling of whitespace in fields and removing the requirement to specify a `delimiter` in order to preserve whitespace. (#51569) Signed-off-by: Ronald Barnes --- pandas/io/parsers/python_parser.py | 50 +++++++++++++++++++++++++++++- pandas/io/parsers/readers.py | 27 ++++++++++++++-- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 62a4e80147780..e1679bbdcf6b2 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -104,6 +104,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.decimal = kwds["decimal"] self.comment = kwds["comment"] + ## GH51569 + self.keep_whitespace = kwds.get("keep_whitespace") + self.whitespace_chars = kwds.get("whitespace_chars") # Set self.data to something that can read lines. 
if isinstance(f, list): @@ -1180,11 +1183,20 @@ def __init__( comment: str | None, skiprows: set[int] | None = None, infer_nrows: int = 100, + ## GH51569 + keep_whitespace: bool | tuple[bool, bool] = (False, False), + whitespace_chars: str = " \t", ) -> None: self.f = f self.buffer: Iterator | None = None self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " self.comment = comment + self.keep_whitespace = keep_whitespace + ## Backwards compatibility means supporting delimiter: + if delimiter: + whitespace_chars = whitespace_chars + delimiter + self.whitespace_chars = whitespace_chars + if colspecs == "infer": self.colspecs = self.detect_colspecs( infer_nrows=infer_nrows, skiprows=skiprows @@ -1210,6 +1222,33 @@ def __init__( "2 element tuple or list of integers" ) + ## GH51569 + ## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields: + if isinstance(self.keep_whitespace, bool): + self.keep_whitespace = (keep_whitespace, keep_whitespace) + ## Ensure tuple is (bool,bool): + if ( + isinstance(self.keep_whitespace, tuple) + and len(self.keep_whitespace) == 2 + and isinstance(self.keep_whitespace[0], bool) + and isinstance(self.keep_whitespace[1], bool) + ): + # Define custom lstrip & rstrip *once*, at __init__: + if self.keep_whitespace[0] is True: + self.ltrim = lambda x: x + else: + self.ltrim = lambda x: x.lstrip(self.whitespace_chars) + if self.keep_whitespace[1] is True: + self.rtrim = lambda x: x + else: + self.rtrim = lambda x: x.rstrip(self.whitespace_chars) + else: + raise ValueError( + "'keep_whitespace' must be a bool or tuple(bool,bool)." + f"\nReceived '{type(self.keep_whitespace).__name__}': " + f"'{self.keep_whitespace}'." + ) + def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]: """ Read rows from self.f, skipping as specified. 
@@ -1281,8 +1320,14 @@ def __next__(self) -> list[str]: line = next(self.f) # type: ignore[arg-type] else: line = next(self.f) # type: ignore[arg-type] + + line = line.rstrip("\r\n") + # Note: 'colspecs' is a sequence of half-open intervals. - return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs] + return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs] + + +# return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs] class FixedWidthFieldParser(PythonParser): @@ -1305,6 +1350,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: self.comment, self.skiprows, self.infer_nrows, + ## GH51569 + self.keep_whitespace, + self.whitespace_chars, ) def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7230c675ee775..081845dbe3357 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -435,7 +435,13 @@ "float_precision": None, } -_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_fwf_defaults = { + "colspecs": "infer", + "infer_nrows": 100, + "widths": None, + "keep_whitespace": (False, False), + "whitespace_chars": " \t", +} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} @@ -1235,10 +1241,13 @@ def read_fwf( widths: Sequence[int] | None = None, infer_nrows: int = 100, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + ## GH51569 + keep_whitespace: bool | tuple[bool, bool] = (False, False), + whitespace_chars: str = " \t", **kwds, ) -> DataFrame | TextFileReader: r""" - Read a table of fixed-width formatted lines into DataFrame. + Read a file of fixed-width lines into DataFrame. Also supports optionally iterating or breaking of the file into chunks. 
@@ -1266,6 +1275,8 @@ def read_fwf( infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. + delimiter : str, default ``' '`` and ``'\t'`` characters + When inferring colspecs, sets the column / field separator. use_nullable_dtypes : bool = False Whether or not to use nullable dtypes as default when reading data. If set to True, nullable dtypes are used for all dtypes that have a nullable @@ -1283,6 +1294,14 @@ def read_fwf( .. versionadded:: 2.0 + keep_whitespace : bool, or tuple (bool,bool), default (False,False) + How to handle whitespace at start,end of each field / column. + whitespace_chars : str, default = ``' '`` and ``'\t'`` characters + If ``keep_whitespace`` is to remove whitespace, these characters are + stripped from each field / column. + + .. versionadded:: 2.0 + **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. @@ -1294,6 +1313,7 @@ def read_fwf( See Also -------- + read_table : Read data from table (i.e. columns with delimiting spaces). DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. read_csv : Read a comma-separated values (csv) file into DataFrame. @@ -1346,6 +1366,9 @@ def read_fwf( kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" kwds["use_nullable_dtypes"] = use_nullable_dtypes + ## GH51569 + kwds["keep_whitespace"] = keep_whitespace + kwds["whitespace_chars"] = whitespace_chars return _read(filepath_or_buffer, kwds) From 40ad7db326d04957a2f2c68d21bfa3d376813443 Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Thu, 23 Feb 2023 01:00:40 -0800 Subject: [PATCH 2/4] DOC: Document behaviour for `keep_whitespace` and `whitespace_chars` options to `read_fwf`. 
(#51569) Signed-off-by: Ronald Barnes --- doc/source/user_guide/io.rst | 88 ++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ec082cb90e75c..259a6db0778e6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1006,7 +1006,7 @@ first read it in as an object dtype and then apply :func:`to_datetime` to each e .. ipython:: python - data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") + data = StringIO("date\n12 Jan 2000\n2000-01-13\n") df = pd.read_csv(data) df['date'] = df['date'].apply(pd.to_datetime) df @@ -1373,8 +1373,7 @@ Files with fixed width columns While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. The function parameters -to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and -a different usage of the ``delimiter`` parameter: +to ``read_fwf`` are largely the same as ``read_csv`` with five extra parameters: * ``colspecs``: A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -1383,12 +1382,46 @@ a different usage of the ``delimiter`` parameter: behavior, if not specified, is to infer. * ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. -* ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Can be used to specify the filler character of the fields - if it is not spaces (e.g., '~'). +* ``keep_whitespace``: A boolean or a tuple(bool,bool) indicating how whitespace + at the (start,end) of each field / column should be handled. +* ``whitespace_chars``: A string of characters to strip from the start and/or end + of fields / columns when 'keep_whitespace' contains a False value. 
+* ``delimiter``: Character(s) separating columns when inferring 'colspecs'. Consider a typical fixed-width data file: +.. ipython:: python + + data = ( + "name1 VANBCCAN 107.51 46 B 8 E \n" + "name2 BBYBCCAN* 20.00 5 1 5 7 F E\n" + "fullname 3VICBCCAN 22.50 3 1 C 5\n" + ) + df = pd.read_fwf(StringIO(data), + header=None, + widths=[10,3,2,3,1,6,3,12], + keep_whitespace=(True,False), + names=["Name", "City", "Prov", "Country", "Deleted", + "TransAvg", "TransCount", "CreditScores"], + # Do not convert field data to NaN: + na_filter=False, + ) + df + df.values + +Note that the name field had trailing whitespace removed, as +did the other text fields. However, the *leading* whitespace in CreditScores was +preserved. + +This is due to the ``keep_whitespace`` setting of (True,False), representing (start,end), and the +``whitespace_chars`` default of ``' '`` and ``'\t'`` ([space] and [tab]). + +The TransAvg and TransCount fields had automatic dtype conversion to +float64 and int64, respectively. + + +Parsing a table is possible (see also ``read_table``): + .. ipython:: python data1 = ( @@ -1398,41 +1431,40 @@ Consider a typical fixed-width data file: "id1230 413.836124 184.375703 11916.8\n" "id1948 502.953953 173.237159 12468.3" ) - with open("bar.csv", "w") as f: - f.write(data1) -In order to parse this file into a ``DataFrame``, we simply need to supply the -column specifications to the ``read_fwf`` function along with the file name: +In order to parse this data set into a ``DataFrame``, we simply need to supply the +column specifications to the ``read_fwf`` function: .. ipython:: python # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] - df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0) + df = pd.read_fwf(StringIO(data1), + colspecs=colspecs, + header=None, + index_col=0 + ) df Note how the parser automatically picks column names X. when -``header=None`` argument is specified. 
Alternatively, you can supply just the -column widths for contiguous columns: - -.. ipython:: python - - # Widths are a list of integers - widths = [6, 14, 13, 10] - df = pd.read_fwf("bar.csv", widths=widths, header=None) - df +``header=None`` argument is specified. -The parser will take care of extra white spaces around the columns -so it's ok to have extra separation between the columns in the file. +The parser will take care of extra white spaces around the numeric data columns, and +trailing spaces on string data, so it's ok to have extra separation between the columns +in the file. By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the first 100 rows of the file. It can do it only in cases when the columns are aligned and correctly separated by the provided ``delimiter`` (default delimiter is whitespace). + .. ipython:: python - df = pd.read_fwf("bar.csv", header=None, index_col=0) + df = pd.read_fwf(StringIO(data1), + header=None, + index_col=0 + ) df ``read_fwf`` supports the ``dtype`` parameter for specifying the types of @@ -1440,10 +1472,16 @@ parsed columns to be different from the inferred type. .. ipython:: python - pd.read_fwf("bar.csv", header=None, index_col=0).dtypes - pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes + pd.read_fwf(StringIO(data1), + header=None, + index_col=0).dtypes + + pd.read_fwf(StringIO(data1), + header=None, + dtype={2: "object"}).dtypes .. ipython:: python + :okexcept: :suppress: os.remove("bar.csv") From 00066375194135ab6012a4b961597116e0fe610b Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Thu, 23 Feb 2023 01:01:47 -0800 Subject: [PATCH 3/4] TST: Integration tests for `keep_whitespace` option of `read_fwf`. 
(#51569) Signed-off-by: Ronald Barnes --- pandas/tests/io/parser/test_read_fwf.py | 52 ++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index c2939f7c12f10..1b33648194b5b 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -633,8 +633,15 @@ def test_whitespace_preservation(): fwf_data = """ a bbb ccdd """ + ## This test used to preserve whitespace by passing a non-space delimiter; + ## ``keep_whitespace=True`` now states that intent directly: result = read_fwf( - StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + StringIO(fwf_data), + widths=[3, 3], + header=header, + skiprows=[0], + # delimiter="\n\t", + keep_whitespace=True, ) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) @@ -1004,3 +1011,46 @@ def test_use_nullable_dtypes_option(): expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_whitespace, data, expected", + [ + ( + # Preserve all whitespace: + True, + # 10-byte wide fields: + ["left ", " centre ", " right "], + DataFrame(["left ", " centre ", " right "]), + ), + ( + # Preserve no whitespace: + False, + # 10-byte wide fields: + ["left ", " centre ", " right "], + DataFrame(["left", "centre", "right"]), + ), + # Preserve leading whitespace only: + ( + (True, False), + ["left ", " centre ", " right"], + DataFrame(["left", " centre", " right"]), + ), + # Preserve trailing whitespace only: + ( + (False, True), + ["left ", " centre ", " right"], + DataFrame(["left ", "centre ", "right"]), + ), + ], +) +def test_fwf_keep_whitespace_true(keep_whitespace, data, expected): + # see GH51569 + + result = read_fwf( + StringIO("\n".join(data)), + header=None, + widths=[10], + keep_whitespace=keep_whitespace, + ) + tm.assert_frame_equal(result, expected) From 
65d444dc07a164a3aa4256ecb892d431d4774408 Mon Sep 17 00:00:00 2001 From: Ronald Barnes Date: Thu, 23 Feb 2023 01:17:34 -0800 Subject: [PATCH 4/4] DOC: What's New for v2.0.0: `keep_whitespace` and `whitespace_chars` arguments for `read_fwf`. (#51569) Signed-off-by: Ronald Barnes --- doc/source/whatsnew/v2.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 29f360e050548..ebf89a201a759 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -312,6 +312,7 @@ Other enhancements - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) - Added support for SQLAlchemy 2.0 (:issue:`40686`) +- Added new arguments ``keep_whitespace`` and ``whitespace_chars`` to :func:`read_fwf`, giving finer and more intuitive control over whitespace handling (:issue:`51569`) - .. --------------------------------------------------------------------------- @@ -829,8 +830,8 @@ Deprecations - Deprecated :meth:`Series.backfill` in favor of :meth:`Series.bfill` (:issue:`33396`) - Deprecated :meth:`DataFrame.pad` in favor of :meth:`DataFrame.ffill` (:issue:`33396`) - Deprecated :meth:`DataFrame.backfill` in favor of :meth:`DataFrame.bfill` (:issue:`33396`) +- Deprecated using the ``delimiter`` option of :func:`read_fwf` to preserve whitespace, in favor of ``keep_whitespace`` and ``whitespace_chars`` (:issue:`51569`) - - .. --------------------------------------------------------------------------- .. _whatsnew_200.prior_deprecations: