Allow reading SAS files from archives #47154

Merged (9 commits) on Jun 15, 2022
Changes from 6 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -154,6 +154,7 @@ Other enhancements
- ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
- :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, :class:`SettingWithCopyWarning`, and :class:`NumExprClobberingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
6 changes: 5 additions & 1 deletion pandas/io/sas/sas7bdat.py
@@ -26,6 +26,7 @@
import numpy as np

from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
)
@@ -168,6 +169,7 @@ def __init__(
encoding=None,
convert_text=True,
convert_header_text=True,
compression: CompressionOptions = "infer",
) -> None:

self.index = index
@@ -195,7 +197,9 @@ def __init__(
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0

self.handles = get_handle(path_or_buf, "rb", is_text=False)
self.handles = get_handle(
path_or_buf, "rb", is_text=False, compression=compression
)

self._path_or_buf = self.handles.handle

8 changes: 7 additions & 1 deletion pandas/io/sas/sas_xport.py
@@ -17,6 +17,7 @@
import numpy as np

from pandas._typing import (
CompressionOptions,
DatetimeNaTType,
FilePath,
ReadBuffer,
@@ -256,6 +257,7 @@ def __init__(
index=None,
encoding: str | None = "ISO-8859-1",
chunksize=None,
compression: CompressionOptions = "infer",
) -> None:

self._encoding = encoding
@@ -264,7 +266,11 @@ def __init__(
self._chunksize = chunksize

self.handles = get_handle(
filepath_or_buffer, "rb", encoding=encoding, is_text=False
filepath_or_buffer,
"rb",
encoding=encoding,
is_text=False,
compression=compression,
)
self.filepath_or_buffer = self.handles.handle

25 changes: 22 additions & 3 deletions pandas/io/sas/sasreader.py
@@ -14,9 +14,16 @@
)

from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
)
from pandas.util._decorators import (
Substitution,
deprecate_nonkeyword_arguments,
)

from pandas.core.shared_docs import _shared_docs

from pandas.io.common import stringify_path

@@ -53,6 +60,7 @@ def read_sas(
encoding: str | None = ...,
chunksize: int = ...,
iterator: bool = ...,
compression: CompressionOptions = ...,
) -> ReaderBase:
...

@@ -65,17 +73,23 @@ def read_sas(
encoding: str | None = ...,
chunksize: None = ...,
iterator: bool = ...,
compression: CompressionOptions = ...,
) -> DataFrame | ReaderBase:
...


@deprecate_nonkeyword_arguments(
version=None, allowed_args=["filepath_or_buffer"], stacklevel=2
)
@Substitution(decompression_options=_shared_docs["decompression_options"])
def read_sas(
filepath_or_buffer: FilePath | ReadBuffer[bytes],
format: str | None = None,
index: Hashable | None = None,
encoding: str | None = None,
chunksize: int | None = None,
iterator: bool = False,
compression: CompressionOptions = "infer",
) -> DataFrame | ReaderBase:
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
@@ -107,6 +121,7 @@ def read_sas(
.. versionchanged:: 1.2

``TextFileReader`` is a context manager.
%(decompression_options)s
Contributor: No indent here for the %(...)

Contributor Author: See question in other comment thread. I think your suggestion is wrong.

Returns
-------
@@ -122,12 +137,14 @@ def read_sas(
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
if ".xpt" in fname:
format = "xport"
elif fname.endswith(".sas7bdat"):
elif ".sas7bdat" in fname:
format = "sas7bdat"
else:
raise ValueError("unable to infer format of SAS file")
raise ValueError(
f"unable to infer format of SAS file from filename: {repr(fname)}"
)

reader: ReaderBase
if format.lower() == "xport":
@@ -138,6 +155,7 @@
index=index,
encoding=encoding,
chunksize=chunksize,
compression=compression,
)
elif format.lower() == "sas7bdat":
from pandas.io.sas.sas7bdat import SAS7BDATReader
@@ -147,6 +165,7 @@
index=index,
encoding=encoding,
chunksize=chunksize,
compression=compression,
)
else:
raise ValueError("unknown SAS format")
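The inference change in this file switches from `endswith` to substring checks, so a compressed name like `airline.sas7bdat.gz` still resolves to the right format after a compression suffix is appended. A standalone sketch of that logic (the helper name `infer_sas_format` is mine; in pandas this lives inline in `read_sas`):

```python
def infer_sas_format(filepath: str) -> str:
    # Substring checks rather than endswith, so a compression suffix
    # ("data.sas7bdat.gz", "data.xpt.bz2") does not defeat inference.
    fname = filepath.lower()
    if ".xpt" in fname:
        return "xport"
    if ".sas7bdat" in fname:
        return "sas7bdat"
    raise ValueError(
        f"unable to infer format of SAS file from filename: {fname!r}"
    )
```

One trade-off of substring matching: a pathological name containing `.xpt` elsewhere (say, a directory named `my.xpt.files`) would also match, but for realistic filenames the looser check is what makes compressed extensions work.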
Binary file added pandas/tests/io/sas/data/airline.sas7bdat.gz
10 changes: 9 additions & 1 deletion pandas/tests/io/sas/test_sas.py
@@ -20,7 +20,15 @@ def test_sas_buffer_format(self):

def test_sas_read_no_format_or_extension(self):
# see gh-24548
msg = "unable to infer format of SAS file"
msg = "unable to infer format of SAS file.+"
with tm.ensure_clean("test_file_no_extension") as path:
with pytest.raises(ValueError, match=msg):
read_sas(path)


def test_sas_archive(datapath):
fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat")
df_uncompressed = read_sas(fname_uncompressed)
fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz")
Contributor: Is it worth adding a gzip'd file? Could the gzip file be created on the fly during the test? That would also let you test other supported formats, e.g., zip.

Contributor Author: We do have .gz files in the repo, so I thought that was the way to go. I can also gzip the file in the test; let me know your preference.

Contributor: I don't think it is essential to use this method. I just wonder about the longer-term future, since it isn't strictly necessary to add compressed versions of files to the main repo. I'm happy to sign off on it as it is now.

Member: My 2c: I would prefer these files to be gzipped on the fly in the test, so that only absolutely necessary files are added to the main repo. It doesn't have to happen in this PR, but a follow-up would be appreciated.
df_compressed = read_sas(fname_compressed, format="sas7bdat")
tm.assert_frame_equal(df_uncompressed, df_compressed)
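The follow-up the reviewers suggest, compressing the fixture on the fly instead of committing a `.gz` file, could look roughly like the helper below (a sketch under my own naming, not the PR's code; a test would then pass the compressed copy to `read_sas(..., format="sas7bdat")` and compare against reading the uncompressed original):

```python
import gzip
import shutil


def gzip_on_the_fly(src_path: str, dst_path: str) -> str:
    # Write a gzipped copy of src_path to dst_path, e.g. into pytest's
    # tmp_path, so no compressed fixture needs to live in the repo.
    with open(src_path, "rb") as src, gzip.open(dst_path, "wb") as dst:
        shutil.copyfileobj(src, dst)
    return dst_path
```

The same pattern generalizes to the other codecs `get_handle` understands (bz2, zip, xz), which is what would let one parametrized test cover every supported compression format.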