diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 681139fb51272..1f91953e2448e 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -176,6 +176,7 @@ Other enhancements
 - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
 - :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, :class:`SettingWithCopyWarning`, :class:`NumExprClobberingError`, :class:`UndefinedVariableError` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
+- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 0ed853d619d4e..a992c1af5ddaf 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -26,6 +26,7 @@
 import numpy as np
 
 from pandas._typing import (
+    CompressionOptions,
     FilePath,
     ReadBuffer,
 )
@@ -168,6 +169,7 @@ def __init__(
         encoding=None,
         convert_text=True,
         convert_header_text=True,
+        compression: CompressionOptions = "infer",
     ) -> None:
 
         self.index = index
@@ -195,7 +197,9 @@ def __init__(
         self._current_row_on_page_index = 0
         self._current_row_in_file_index = 0
 
-        self.handles = get_handle(path_or_buf, "rb", is_text=False)
+        self.handles = get_handle(
+            path_or_buf, "rb", is_text=False, compression=compression
+        )
         self._path_or_buf = self.handles.handle
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index a64ade2b3c77c..db09983cacfbc 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -17,6 +17,7 @@
 import numpy as np
 
 from pandas._typing import (
+    CompressionOptions,
     DatetimeNaTType,
     FilePath,
     ReadBuffer,
 )
@@ -256,6 +257,7 @@ def __init__(
         index=None,
         encoding: str | None = "ISO-8859-1",
         chunksize=None,
+        compression: CompressionOptions = "infer",
     ) -> None:
 
         self._encoding = encoding
@@ -264,7 +266,11 @@ def __init__(
         self._chunksize = chunksize
 
         self.handles = get_handle(
-            filepath_or_buffer, "rb", encoding=encoding, is_text=False
+            filepath_or_buffer,
+            "rb",
+            encoding=encoding,
+            is_text=False,
+            compression=compression,
         )
         self.filepath_or_buffer = self.handles.handle
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index f50fc777f55e9..ff50df886e627 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -14,9 +14,16 @@
 )
 
 from pandas._typing import (
+    CompressionOptions,
     FilePath,
     ReadBuffer,
 )
+from pandas.util._decorators import (
+    deprecate_nonkeyword_arguments,
+    doc,
+)
+
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import stringify_path
 
@@ -53,6 +60,7 @@ def read_sas(
     encoding: str | None = ...,
     chunksize: int = ...,
     iterator: bool = ...,
+    compression: CompressionOptions = ...,
 ) -> ReaderBase:
     ...
 
@@ -65,10 +73,15 @@ def read_sas(
     encoding: str | None = ...,
     chunksize: None = ...,
     iterator: bool = ...,
+    compression: CompressionOptions = ...,
 ) -> DataFrame | ReaderBase:
     ...
 
 
+@deprecate_nonkeyword_arguments(
+    version=None, allowed_args=["filepath_or_buffer"], stacklevel=2
+)
+@doc(decompression_options=_shared_docs["decompression_options"])
 def read_sas(
     filepath_or_buffer: FilePath | ReadBuffer[bytes],
     format: str | None = None,
@@ -76,6 +89,7 @@ def read_sas(
     encoding: str | None = None,
     chunksize: int | None = None,
     iterator: bool = False,
+    compression: CompressionOptions = "infer",
 ) -> DataFrame | ReaderBase:
     """
     Read SAS files stored as either XPORT or SAS7BDAT format files.
@@ -88,7 +102,7 @@ def read_sas(
         Valid URL schemes include http, ftp, s3, and file. For file URLs, a
         host is expected. A local file could be:
         ``file://localhost/path/to/table.sas``.
-    format : str {'xport', 'sas7bdat'} or None
+    format : str {{'xport', 'sas7bdat'}} or None
         If None, file format is inferred from file extension. If 'xport' or
         'sas7bdat', uses the corresponding format.
     index : identifier of index column, defaults to None
@@ -107,6 +121,7 @@ def read_sas(
         .. versionchanged:: 1.2
 
             ``TextFileReader`` is a context manager.
+    {decompression_options}
 
     Returns
     -------
@@ -122,12 +137,14 @@ def read_sas(
         if not isinstance(filepath_or_buffer, str):
             raise ValueError(buffer_error_msg)
         fname = filepath_or_buffer.lower()
-        if fname.endswith(".xpt"):
+        if ".xpt" in fname:
             format = "xport"
-        elif fname.endswith(".sas7bdat"):
+        elif ".sas7bdat" in fname:
             format = "sas7bdat"
         else:
-            raise ValueError("unable to infer format of SAS file")
+            raise ValueError(
+                f"unable to infer format of SAS file from filename: {repr(fname)}"
+            )
 
     reader: ReaderBase
     if format.lower() == "xport":
@@ -138,6 +155,7 @@ def read_sas(
             index=index,
             encoding=encoding,
             chunksize=chunksize,
+            compression=compression,
         )
     elif format.lower() == "sas7bdat":
         from pandas.io.sas.sas7bdat import SAS7BDATReader
@@ -147,6 +165,7 @@ def read_sas(
             index=index,
             encoding=encoding,
             chunksize=chunksize,
+            compression=compression,
         )
     else:
         raise ValueError("unknown SAS format")
diff --git a/pandas/tests/io/sas/data/airline.sas7bdat.gz b/pandas/tests/io/sas/data/airline.sas7bdat.gz
new file mode 100644
index 0000000000000..7b56e492295f4
Binary files /dev/null and b/pandas/tests/io/sas/data/airline.sas7bdat.gz differ
diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py
index 5d2643c20ceb2..1e38baf4fc409 100644
--- a/pandas/tests/io/sas/test_sas.py
+++ b/pandas/tests/io/sas/test_sas.py
@@ -20,7 +20,15 @@ def test_sas_buffer_format(self):
 
     def test_sas_read_no_format_or_extension(self):
         # see gh-24548
-        msg = "unable to infer format of SAS file"
+        msg = "unable to infer format of SAS file.+"
         with tm.ensure_clean("test_file_no_extension") as path:
             with pytest.raises(ValueError, match=msg):
                 read_sas(path)
+
+
+def test_sas_archive(datapath):
+    fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat")
+    df_uncompressed = read_sas(fname_uncompressed)
+    fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz")
+    df_compressed = read_sas(fname_compressed, format="sas7bdat")
+    tm.assert_frame_equal(df_uncompressed, df_compressed)
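
For context, a minimal usage sketch of the new ``compression`` keyword is shown below (not part of the diff). It assumes a local gzip-compressed file such as the ``airline.sas7bdat.gz`` fixture added above; the path is illustrative only.

```python
# Sketch of exercising the new compression= keyword of pandas.read_sas.
# The file path is hypothetical; any gzip-compressed SAS7BDAT file would do.
import pandas as pd

# With the default compression="infer", the ".gz" suffix is enough for
# get_handle() to decompress transparently; format="sas7bdat" is passed
# explicitly, mirroring the new test above.
df = pd.read_sas("airline.sas7bdat.gz", format="sas7bdat")

# The codec can also be named explicitly instead of relying on inference.
df = pd.read_sas("airline.sas7bdat.gz", format="sas7bdat", compression="gzip")
```

Because decompression is delegated to ``get_handle``, the codecs supported elsewhere in pandas IO (gzip, bz2, zip, xz, zstd, or an explicit options dict) should behave the same way here.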