From 6b36015df7532d959589759a9bc5a67324360439 Mon Sep 17 00:00:00 2001 From: Christopher Roberts Date: Sat, 4 Aug 2018 22:47:40 -0500 Subject: [PATCH] BUG: Infer compression by default in read_fwf() Closes gh-22199. --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 14 ++++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a2abda019812a..d41293d7655a0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1528,6 +1528,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) - Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) +- Bug in :func:`read_fwf` in which the compression type of a file was not being properly inferred (:issue:`22199`) - Bug in :func:`pandas.io.json.json_normalize` that caused it to raise ``TypeError`` when two consecutive elements of ``record_path`` are dicts (:issue:`22706`) - Bug in :meth:`DataFrame.to_stata`, :class:`pandas.io.stata.StataWriter` and :class:`pandas.io.stata.StataWriter117` where a exception would leave a partially written and invalid dta file (:issue:`23573`) - Bug in :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` that produced invalid files when using strLs with non-ASCII characters (:issue:`23573`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aadca1fcb3bef..926d889bf8f91 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -401,7 +401,7 @@ def _read(filepath_or_buffer, kwds): encoding = re.sub('_', '-', encoding).lower() kwds['encoding'] = encoding - compression = kwds.get('compression') + compression = kwds.get('compression', 'infer') compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( filepath_or_buffer, encoding, compression) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index e8c5b37579d71..172bbe0bad4c7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -555,20 +555,26 @@ def test_default_delimiter(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("compression", ["gzip", "bz2"]) -def test_fwf_compression(compression): +@pytest.mark.parametrize("infer", [True, False, None]) +def test_fwf_compression(compression_only, infer): data = """1111111111 2222222222 3333333333""".strip() + compression = compression_only + extension = "gz" if compression == "gzip" else compression + kwargs = dict(widths=[5, 5], names=["one", "two"]) expected = read_fwf(StringIO(data), **kwargs) if compat.PY3: data = bytes(data, encoding="utf-8") - with tm.ensure_clean() as path: + with tm.ensure_clean(filename="tmp." + extension) as path: tm.write_to_compressed(compression, path, data) - result = read_fwf(path, compression=compression, **kwargs) + if infer is not None: + kwargs["compression"] = "infer" if infer else compression + + result = read_fwf(path, **kwargs) tm.assert_frame_equal(result, expected)