Commit 1f52991

ENH: Add compression to read_stata and StataReader
Add support for reading compressed dta files directly (xref pandas-dev#26599).
1 parent b8890eb commit 1f52991
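
As a quick illustration of the new keyword (the file name here is hypothetical; the behaviour follows the docstring and signature added in pandas/io/stata.py below):

import pandas as pd

# With the default compression="infer", the codec is detected from the
# ".gz" extension; it can also be named explicitly.
df = pd.read_stata("data.dta.gz")
df = pd.read_stata("data.dta.gz", compression="gzip")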

File tree

3 files changed (+66, -2 lines):

- doc/source/whatsnew/v1.3.0.rst
- pandas/io/stata.py
- pandas/tests/io/test_stata.py


doc/source/whatsnew/v1.3.0.rst (+1 line)

@@ -53,6 +53,7 @@ Other enhancements
 - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`Series.apply` can now accept list-like or dictionary-like arguments that aren't lists or dictionaries, e.g. ``ser.apply(np.array(["sum", "mean"]))``, which was already the case for :meth:`DataFrame.apply` (:issue:`39140`)
 - :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes.
+- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.

 .. ---------------------------------------------------------------------------

pandas/io/stata.py (+20, -2 lines)

@@ -98,6 +98,19 @@
     Return StataReader object for iterations, returns chunks with
     given number of lines."""

+_compression_params = f"""\
+compression : str or dict, default None
+    If string, specifies compression mode. If dict, value at key 'method'
+    specifies compression mode. Compression mode must be one of {{'infer',
+    'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer'
+    and `filepath_or_buffer` is path-like, then detect compression from
+    the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
+    no compression). If dict and compression mode is one of
+    {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above,
+    other entries passed as additional compression options.
+{generic._shared_docs["storage_options"]}"""
+
+
 _iterator_params = """\
 iterator : bool, default False
     Return StataReader object."""

@@ -129,6 +142,7 @@
 {_statafile_processing_params2}
 {_chunksize_params}
 {_iterator_params}
+{_compression_params}

 Returns
 -------

@@ -180,6 +194,7 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
+{_compression_params}

 {_reader_notes}
 """

@@ -1026,6 +1041,7 @@ def __init__(
         columns: Optional[Sequence[str]] = None,
         order_categoricals: bool = True,
         chunksize: Optional[int] = None,
+        compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
     ):
         super().__init__()

@@ -1064,10 +1080,10 @@ def __init__(
             "rb",
             storage_options=storage_options,
             is_text=False,
+            compression=compression,
         ) as handles:
             # Copy to BytesIO, and ensure no encoding
-            contents = handles.handle.read()
-            self.path_or_buf = BytesIO(contents)  # type: ignore[arg-type]
+            self.path_or_buf = BytesIO(handles.handle.read())  # type: ignore[arg-type]

         self._read_header()
         self._setup_dtype()

@@ -1898,6 +1914,7 @@ def read_stata(
     order_categoricals: bool = True,
     chunksize: Optional[int] = None,
     iterator: bool = False,
+    compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
 ) -> Union[DataFrame, StataReader]:

@@ -1912,6 +1929,7 @@ def read_stata(
         order_categoricals=order_categoricals,
         chunksize=chunksize,
         storage_options=storage_options,
+        compression=compression,
     )

     if iterator or chunksize:

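The same keyword is accepted by StataReader directly; a minimal sketch (the path and codec are illustrative), using the reader as a context manager so the handle is closed:

from pandas.io.stata import StataReader

# compression defaults to "infer"; an explicit codec is shown for clarity.
with StataReader("data.dta.bz2", compression="bz2") as reader:
    df = reader.read()
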
pandas/tests/io/test_stata.py (+45 lines)

@@ -2003,3 +2003,48 @@ def test_precision_loss():
     tm.assert_series_equal(reread.dtypes, expected_dt)
     assert reread.loc[0, "little"] == df.loc[0, "little"]
     assert reread.loc[0, "big"] == float(df.loc[0, "big"])
+
+
+def test_compression_roundtrip(compression):
+    df = DataFrame(
+        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        index=["A", "B"],
+        columns=["X", "Y", "Z"],
+    )
+    df.index.name = "index"
+
+    with tm.ensure_clean() as path:
+
+        df.to_stata(path, compression=compression)
+        reread = read_stata(path, compression=compression, index_col="index")
+        tm.assert_frame_equal(df, reread)
+
+        # explicitly ensure file was compressed.
+        with tm.decompress_file(path, compression) as fh:
+            contents = io.BytesIO(fh.read())
+        reread = pd.read_stata(contents, index_col="index")
+        tm.assert_frame_equal(df, reread)
+
+
+@pytest.mark.parametrize("to_infer", [True, False])
+@pytest.mark.parametrize("read_infer", [True, False])
+def test_stata_compression(compression_only, read_infer, to_infer):
+    compression = compression_only
+
+    ext = "gz" if compression == "gzip" else compression
+    filename = f"test.{ext}"
+
+    df = DataFrame(
+        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        index=["A", "B"],
+        columns=["X", "Y", "Z"],
+    )
+    df.index.name = "index"
+
+    to_compression = "infer" if to_infer else compression
+    read_compression = "infer" if read_infer else compression
+
+    with tm.ensure_clean(filename) as path:
+        df.to_stata(path, compression=to_compression)
+        result = pd.read_stata(path, compression=read_compression, index_col="index")
+        tm.assert_frame_equal(result, df)

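The tests above exercise the string form of the keyword; the new docstring also allows a dict whose extra entries are forwarded to the underlying codec. A rough sketch (the file name and compresslevel are illustrative, and it assumes DataFrame.to_stata forwards the dict the same way):

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
# "method" selects the codec; any remaining keys (here gzip's compresslevel)
# are passed through as additional compression options.
df.to_stata("frame.dta.gz", compression={"method": "gzip", "compresslevel": 1})
roundtrip = pd.read_stata("frame.dta.gz", compression={"method": "gzip"})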