From ec0a292919f5ff4aabe54c3bef322c6f7caac42e Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 9 Aug 2017 14:13:13 -0400 Subject: [PATCH 1/7] Test: infer compression from pathlib.Path --- pandas/tests/io/parser/compression.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 797c12139656d..326fbf99e7af0 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -157,6 +157,21 @@ def test_read_csv_infer_compression(self): inputs[3].close() + def test_read_csv_infer_compression_pathlib(self): + """ + Test that compression is inferred from pathlib.Path paths. + """ + try: + import pathlib + except ImportError: + pytest.skip('need pathlib to run') + expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) + for extension in '', '.gz', '.bz2': + path = pathlib.Path(self.csv1 + extension) + df = self.read_csv( + path, index_col=0, parse_dates=True, compression='infer') + tm.assert_frame_equal(expected, df) + def test_invalid_compression(self): msg = 'Unrecognized compression type: sfark' with tm.assert_raises_regex(ValueError, msg): From 64d55c07dbf3d8650c6b8a2e7d8c44fab2efb83d Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 9 Aug 2017 14:49:20 -0400 Subject: [PATCH 2/7] Infer compression from non-string paths --- pandas/io/common.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index cbfc33dbebb81..69a7e69ea724b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -272,13 +272,15 @@ def _infer_compression(filepath_or_buffer, compression): if compression is None: return None - # Cannot infer compression of a buffer. Hence assume no compression. - is_path = isinstance(filepath_or_buffer, compat.string_types) - if compression == 'infer' and not is_path: - return None - - # Infer compression from the filename/URL extension + # Infer compression if compression == 'infer': + # Convert all path types (e.g. pathlib.Path) to strings + filepath_or_buffer = _stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, compat.string_types): + # Cannot infer compression of a buffer, assume no compression + return None + + # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): if filepath_or_buffer.endswith(extension): return compression From 5272b9e95eec9c6f6dc8e6551e18f576d4f088a5 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 9 Aug 2017 16:11:03 -0400 Subject: [PATCH 3/7] Documentation updates --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/parsers.py | 10 +++++----- pandas/io/pickle.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b8f142700b830..f7a8bd4e168a7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -126,6 +126,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) +- :func:`read_csv` can now infer compression from non-string paths, such as a ``pathlab.Path`` objects (:issue:`17206`). .. _whatsnew_0210.api_breaking: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9c76d3126890c..05a04f268f72b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -208,11 +208,11 @@ `_ for more information on ``iterator`` and ``chunksize``. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', - '.zip', or 'xz', respectively, and no decompression otherwise. If using - 'zip', the ZIP file must contain only one data file to be read in. - Set to None for no decompression. + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6f345092c514d..143b76575e36b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -62,8 +62,8 @@ def read_pickle(path, compression='infer'): File path compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz', - or 'zip' respectively, and no decompression otherwise. + gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', + or '.zip' respectively, and no decompression otherwise. Set to None for no decompression. .. versionadded:: 0.20.0 From e3d4d9ad19476c5cf23367120560f31413e11c4a Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Thu, 10 Aug 2017 10:12:30 -0400 Subject: [PATCH 4/7] Address @gfyoung reviews https://github.com/pandas-dev/pandas/pull/17206#pullrequestreview-55411442 https://github.com/pandas-dev/pandas/pull/17206#pullrequestreview-55411536 https://github.com/pandas-dev/pandas/pull/17206#discussion_r132341564 --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/tests/io/parser/compression.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f7a8bd4e168a7..1f20550852f23 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -126,7 +126,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- :func:`read_csv` can now infer compression from non-string paths, such as a ``pathlab.Path`` objects (:issue:`17206`). +- :func:`read_csv` can now infer compression from non-string paths, such as a ``pathlib.Path`` objects (:issue:`17206`). .. _whatsnew_0210.api_breaking: diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 326fbf99e7af0..375aab5e17547 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -157,20 +157,17 @@ def test_read_csv_infer_compression(self): inputs[3].close() - def test_read_csv_infer_compression_pathlib(self): + @pytest.mark.parametrize('extension', ['', '.gz', '.bz2']) + def test_read_csv_infer_compression_pathlib(self, extension): """ Test that compression is inferred from pathlib.Path paths. """ - try: - import pathlib - except ImportError: - pytest.skip('need pathlib to run') - expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) - for extension in '', '.gz', '.bz2': - path = pathlib.Path(self.csv1 + extension) - df = self.read_csv( - path, index_col=0, parse_dates=True, compression='infer') - tm.assert_frame_equal(expected, df) + pathlib = pytest.importorskip('pathlib') + read_csv_kwargs = {'index_col': 0, 'parse_dates': True} + expected = self.read_csv(self.csv1, **read_csv_kwargs) + path = pathlib.Path(self.csv1 + extension) + df = self.read_csv(path, compression='infer', **read_csv_kwargs) + tm.assert_frame_equal(expected, df) def test_invalid_compression(self): msg = 'Unrecognized compression type: sfark' From 8fcf3988e6d9193d96ccefbfec88d4e1fff91b4f Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Thu, 10 Aug 2017 10:25:23 -0400 Subject: [PATCH 5/7] Generalize What's New entry https://github.com/pandas-dev/pandas/pull/17206#pullrequestreview-55495655 --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1f20550852f23..a6742fedc580d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -126,7 +126,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- :func:`read_csv` can now infer compression from non-string paths, such as a ``pathlib.Path`` objects (:issue:`17206`). +- `read_*` methods can now infer compression from non-string paths, such as a ``pathlib.Path`` objects (:issue:`17206`). .. _whatsnew_0210.api_breaking: From 0f925c15d6d8a3c2248139f07fef3bb61385fecd Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Tue, 15 Aug 2017 14:16:38 -0400 Subject: [PATCH 6/7] Test _infer_compression in io/test_common.py --- pandas/tests/io/parser/compression.py | 12 --------- pandas/tests/io/test_common.py | 38 ++++++++++++++++++++------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 375aab5e17547..797c12139656d 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -157,18 +157,6 @@ def test_read_csv_infer_compression(self): inputs[3].close() - @pytest.mark.parametrize('extension', ['', '.gz', '.bz2']) - def test_read_csv_infer_compression_pathlib(self, extension): - """ - Test that compression is inferred from pathlib.Path paths. - """ - pathlib = pytest.importorskip('pathlib') - read_csv_kwargs = {'index_col': 0, 'parse_dates': True} - expected = self.read_csv(self.csv1, **read_csv_kwargs) - path = pathlib.Path(self.csv1 + extension) - df = self.read_csv(path, compression='infer', **read_csv_kwargs) - tm.assert_frame_equal(expected, df) - def test_invalid_compression(self): msg = 'Unrecognized compression type: sfark' with tm.assert_raises_regex(ValueError, msg): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b527e3c5dc254..30904593fedc4 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -14,16 +14,6 @@ from pandas import read_csv, concat -try: - from pathlib import Path -except ImportError: - pass - -try: - from py.path import local as LocalPath -except ImportError: - pass - class CustomFSPath(object): """For testing fspath on unknown objects""" @@ -34,6 +24,21 @@ def __fspath__(self): return self.path +# Functions that consume a string path and return a string or path-like object +path_types = [str, CustomFSPath] + +try: + from pathlib import Path + path_types.append(Path) +except ImportError: + pass + +try: + from py.path import local as LocalPath + path_types.append(LocalPath) +except ImportError: + pass + HERE = os.path.dirname(__file__) @@ -83,6 +88,19 @@ def test_stringify_path_fspath(self): result = common._stringify_path(p) assert result == 'foo/bar.csv' + @pytest.mark.parametrize('extension,expected', [ + ('', None), + ('.gz', 'gzip'), + ('.bz2', 'bz2'), + ('.zip', 'zip'), + ('.xz', 'xz'), + ]) + @pytest.mark.parametrize('path_type', path_types) + def test_infer_compression_from_path(self, extension, expected, path_type): + path = path_type('foo/bar.csv' + extension) + compression = common._infer_compression(path, compression='infer') + assert compression == expected + def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) From 8a15074f3e26e92608b0eaebf15c2ddf693b1778 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Tue, 15 Aug 2017 14:31:11 -0400 Subject: [PATCH 7/7] fixup! Generalize What's New entry --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index a6742fedc580d..4032a7d22d4a2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -126,7 +126,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- `read_*` methods can now infer compression from non-string paths, such as a ``pathlib.Path`` objects (:issue:`17206`). +- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). .. _whatsnew_0210.api_breaking: