Skip to content

Commit 7397adc

Browse files
jonashaag and jreback authored
Allow reading SAS files from archives (#47154)
* Allow reading SAS files from archives
* Add missing file
* Review feedback
* Fix

Co-authored-by: Jeff Reback <[email protected]>
1 parent a8153a8 commit 7397adc

File tree

6 files changed

+45
-7
lines changed

6 files changed

+45
-7
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ Other enhancements
176176
- ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
177177
- :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, :class:`SettingWithCopyWarning`, :class:`NumExprClobberingError`, :class:`UndefinedVariableError`, and :class:`IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
178178
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
179+
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
179180

180181
.. ---------------------------------------------------------------------------
181182
.. _whatsnew_150.notable_bug_fixes:

pandas/io/sas/sas7bdat.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import numpy as np
2727

2828
from pandas._typing import (
29+
CompressionOptions,
2930
FilePath,
3031
ReadBuffer,
3132
)
@@ -168,6 +169,7 @@ def __init__(
168169
encoding=None,
169170
convert_text=True,
170171
convert_header_text=True,
172+
compression: CompressionOptions = "infer",
171173
) -> None:
172174

173175
self.index = index
@@ -195,7 +197,9 @@ def __init__(
195197
self._current_row_on_page_index = 0
196198
self._current_row_in_file_index = 0
197199

198-
self.handles = get_handle(path_or_buf, "rb", is_text=False)
200+
self.handles = get_handle(
201+
path_or_buf, "rb", is_text=False, compression=compression
202+
)
199203

200204
self._path_or_buf = self.handles.handle
201205

pandas/io/sas/sas_xport.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import numpy as np
1818

1919
from pandas._typing import (
20+
CompressionOptions,
2021
DatetimeNaTType,
2122
FilePath,
2223
ReadBuffer,
@@ -256,6 +257,7 @@ def __init__(
256257
index=None,
257258
encoding: str | None = "ISO-8859-1",
258259
chunksize=None,
260+
compression: CompressionOptions = "infer",
259261
) -> None:
260262

261263
self._encoding = encoding
@@ -264,7 +266,11 @@ def __init__(
264266
self._chunksize = chunksize
265267

266268
self.handles = get_handle(
267-
filepath_or_buffer, "rb", encoding=encoding, is_text=False
269+
filepath_or_buffer,
270+
"rb",
271+
encoding=encoding,
272+
is_text=False,
273+
compression=compression,
268274
)
269275
self.filepath_or_buffer = self.handles.handle
270276

pandas/io/sas/sasreader.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,16 @@
1414
)
1515

1616
from pandas._typing import (
17+
CompressionOptions,
1718
FilePath,
1819
ReadBuffer,
1920
)
21+
from pandas.util._decorators import (
22+
deprecate_nonkeyword_arguments,
23+
doc,
24+
)
25+
26+
from pandas.core.shared_docs import _shared_docs
2027

2128
from pandas.io.common import stringify_path
2229

@@ -53,6 +60,7 @@ def read_sas(
5360
encoding: str | None = ...,
5461
chunksize: int = ...,
5562
iterator: bool = ...,
63+
compression: CompressionOptions = ...,
5664
) -> ReaderBase:
5765
...
5866

@@ -65,17 +73,23 @@ def read_sas(
6573
encoding: str | None = ...,
6674
chunksize: None = ...,
6775
iterator: bool = ...,
76+
compression: CompressionOptions = ...,
6877
) -> DataFrame | ReaderBase:
6978
...
7079

7180

81+
@deprecate_nonkeyword_arguments(
82+
version=None, allowed_args=["filepath_or_buffer"], stacklevel=2
83+
)
84+
@doc(decompression_options=_shared_docs["decompression_options"])
7285
def read_sas(
7386
filepath_or_buffer: FilePath | ReadBuffer[bytes],
7487
format: str | None = None,
7588
index: Hashable | None = None,
7689
encoding: str | None = None,
7790
chunksize: int | None = None,
7891
iterator: bool = False,
92+
compression: CompressionOptions = "infer",
7993
) -> DataFrame | ReaderBase:
8094
"""
8195
Read SAS files stored as either XPORT or SAS7BDAT format files.
@@ -88,7 +102,7 @@ def read_sas(
88102
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
89103
expected. A local file could be:
90104
``file://localhost/path/to/table.sas``.
91-
format : str {'xport', 'sas7bdat'} or None
105+
format : str {{'xport', 'sas7bdat'}} or None
92106
If None, file format is inferred from file extension. If 'xport' or
93107
'sas7bdat', uses the corresponding format.
94108
index : identifier of index column, defaults to None
@@ -107,6 +121,7 @@ def read_sas(
107121
.. versionchanged:: 1.2
108122
109123
``TextFileReader`` is a context manager.
124+
{decompression_options}
110125
111126
Returns
112127
-------
@@ -122,12 +137,14 @@ def read_sas(
122137
if not isinstance(filepath_or_buffer, str):
123138
raise ValueError(buffer_error_msg)
124139
fname = filepath_or_buffer.lower()
125-
if fname.endswith(".xpt"):
140+
if ".xpt" in fname:
126141
format = "xport"
127-
elif fname.endswith(".sas7bdat"):
142+
elif ".sas7bdat" in fname:
128143
format = "sas7bdat"
129144
else:
130-
raise ValueError("unable to infer format of SAS file")
145+
raise ValueError(
146+
f"unable to infer format of SAS file from filename: {repr(fname)}"
147+
)
131148

132149
reader: ReaderBase
133150
if format.lower() == "xport":
@@ -138,6 +155,7 @@ def read_sas(
138155
index=index,
139156
encoding=encoding,
140157
chunksize=chunksize,
158+
compression=compression,
141159
)
142160
elif format.lower() == "sas7bdat":
143161
from pandas.io.sas.sas7bdat import SAS7BDATReader
@@ -147,6 +165,7 @@ def read_sas(
147165
index=index,
148166
encoding=encoding,
149167
chunksize=chunksize,
168+
compression=compression,
150169
)
151170
else:
152171
raise ValueError("unknown SAS format")
1.4 KB
Binary file not shown.

pandas/tests/io/sas/test_sas.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,15 @@ def test_sas_buffer_format(self):
2020

2121
def test_sas_read_no_format_or_extension(self):
2222
# see gh-24548
23-
msg = "unable to infer format of SAS file"
23+
msg = "unable to infer format of SAS file.+"
2424
with tm.ensure_clean("test_file_no_extension") as path:
2525
with pytest.raises(ValueError, match=msg):
2626
read_sas(path)
27+
28+
29+
def test_sas_archive(datapath):
30+
fname_uncompressed = datapath("io", "sas", "data", "airline.sas7bdat")
31+
df_uncompressed = read_sas(fname_uncompressed)
32+
fname_compressed = datapath("io", "sas", "data", "airline.sas7bdat.gz")
33+
df_compressed = read_sas(fname_compressed, format="sas7bdat")
34+
tm.assert_frame_equal(df_uncompressed, df_compressed)

0 commit comments

Comments
 (0)