ENH: Add compression to stata exporters

bashtage · bashtage · commit f1c87bfad1fc · 2020-05-05T23:16:53.000+01:00
Add standard compression options to Stata exporters. Closes #26599
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -25,6 +25,7 @@
     Iterable,
     Iterator,
     List,
+    Mapping,
     Optional,
     Sequence,
     Set,
@@ -1975,6 +1976,7 @@ def to_stata(
         variable_labels: Optional[Dict[Label, str]] = None,
         version: Optional[int] = 114,
         convert_strl: Optional[Sequence[Label]] = None,
+        compression: Optional[str] = "infer",
     ) -> None:
         """
         Export DataFrame object to Stata dta format.
@@ -2038,6 +2040,14 @@ def to_stata(
 
             .. versionadded:: 0.23.0
 
+        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+            For on-the-fly compression of the output dta. If 'infer', then use
+            gzip, bz2, zip or xz if path_or_buf is a string ending in
+            '.gz', '.bz2', '.zip', or '.xz', respectively, and no compression
+            otherwise.
+
+            .. versionadded:: 1.2.0
+
         Raises
         ------
         NotImplementedError
@@ -2093,6 +2103,7 @@ def to_stata(
             data_label=data_label,
             write_index=write_index,
             variable_labels=variable_labels,
+            compression=compression,
             **kwargs,
         )
         writer.write_file()
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -16,7 +16,18 @@
 from pathlib import Path
 import struct
 import sys
-from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
+from typing import (
+    Any,
+    AnyStr,
+    BinaryIO,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    Mapping,
+)
 import warnings
 
 from dateutil.relativedelta import relativedelta
@@ -47,7 +58,13 @@
 from pandas.core.indexes.base import Index
 from pandas.core.series import Series
 
-from pandas.io.common import get_filepath_or_buffer, stringify_path
+from pandas.io.common import (
+    get_compression_method,
+    get_filepath_or_buffer,
+    stringify_path,
+    get_handle,
+    infer_compression,
+)
 
 _version_error = (
     "Version of given Stata file is {version}. pandas supports importing "
@@ -1854,7 +1871,9 @@ def read_stata(
     return data
 
 
-def _open_file_binary_write(fname: FilePathOrBuffer) -> Tuple[BinaryIO, bool]:
+def _open_file_binary_write(
+    fname: FilePathOrBuffer, compression: Optional[str]
+) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]:
     """
     Open a binary file or no-op if file-like.
 
@@ -1871,9 +1890,15 @@ def _open_file_binary_write(fname: FilePathOrBuffer) -> Tuple[BinaryIO, bool]:
     """
     if hasattr(fname, "write"):
         # See https://github.com/python/mypy/issues/1424 for hasattr challenges
-        return fname, False  # type: ignore
+        return fname, False, None  # type: ignore
     elif isinstance(fname, (str, Path)):
-        return open(fname, "wb"), True
+        # Resolve "infer" to a concrete method based on the file extension
+        compression = infer_compression(fname, compression)
+        path_or_buf, _, compression, _ = get_filepath_or_buffer(
+            fname, compression=compression
+        )
+        f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False)
+        return f, True, compression
     else:
         raise TypeError("fname must be a binary file, buffer or path-like.")
 
@@ -2050,6 +2075,13 @@ class StataWriter(StataParser):
     variable_labels : dict
         Dictionary containing columns as keys and variable labels as values.
         Each label must be 80 characters or smaller.
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly compression of the output dta. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no compression
+        otherwise.
+
+        .. versionadded:: 1.2.0
 
     Returns
     -------
@@ -2094,6 +2126,7 @@ def __init__(
         time_stamp: Optional[datetime.datetime] = None,
         data_label: Optional[str] = None,
         variable_labels: Optional[Dict[Label, str]] = None,
+        compression: Optional[str] = "infer",
     ):
         super().__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
@@ -2102,6 +2135,8 @@ def __init__(
         self._data_label = data_label
         self._variable_labels = variable_labels
         self._own_file = True
+        self._compression = compression
+        self._output_file: Optional[BinaryIO] = None
         # attach nobs, nvars, data, varlist, typlist
         self._prepare_pandas(data)
 
@@ -2389,7 +2424,12 @@ def _encode_strings(self) -> None:
                     self.data[col] = encoded
 
     def write_file(self) -> None:
-        self._file, self._own_file = _open_file_binary_write(self._fname)
+        self._file, self._own_file, compression = _open_file_binary_write(
+            self._fname, self._compression
+        )
+        if compression is not None:
+            self._output_file = self._file
+            self._file = BytesIO()
         try:
             self._write_header(data_label=self._data_label, time_stamp=self._time_stamp)
             self._write_map()
@@ -2434,6 +2474,12 @@ def _close(self) -> None:
         """
         # Some file-like objects might not support flush
         assert self._file is not None
+        if self._output_file is not None:
+            assert isinstance(self._file, BytesIO)
+            bio = self._file
+            bio.seek(0)
+            self._file = self._output_file
+            self._file.write(bio.read())
         try:
             self._file.flush()
         except AttributeError:
@@ -2898,6 +2944,13 @@ class StataWriter117(StataWriter):
         Smaller columns can be converted by including the column name.  Using
         StrLs can reduce output file size when strings are longer than 8
         characters, and either frequently repeated or sparse.
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly compression of the output dta. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no compression
+        otherwise.
+
+        .. versionadded:: 1.2.0
 
     Returns
     -------
@@ -2946,6 +2999,7 @@ def __init__(
         data_label: Optional[str] = None,
         variable_labels: Optional[Dict[Label, str]] = None,
         convert_strl: Optional[Sequence[Label]] = None,
+        compression: Optional[str] = "infer",
     ):
         # Copy to new list since convert_strl might be modified later
         self._convert_strl: List[Label] = []
@@ -2961,6 +3015,7 @@ def __init__(
             time_stamp=time_stamp,
             data_label=data_label,
             variable_labels=variable_labels,
+            compression=compression,
         )
         self._map: Dict[str, int] = {}
         self._strl_blob = b""
@@ -3281,6 +3336,13 @@ class StataWriterUTF8(StataWriter117):
         The dta version to use. By default, uses the size of data to determine
         the version. 118 is used if data.shape[1] <= 32767, and 119 is used
         for storing larger DataFrames.
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly compression of the output dta. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no compression
+        otherwise.
+
+        .. versionadded:: 1.2.0
 
     Returns
     -------
@@ -3331,6 +3393,7 @@ def __init__(
         variable_labels: Optional[Dict[Label, str]] = None,
         convert_strl: Optional[Sequence[Label]] = None,
         version: Optional[int] = None,
+        compression: Optional[str] = "infer",
     ):
         if version is None:
             version = 118 if data.shape[1] <= 32767 else 119
@@ -3352,6 +3415,7 @@ def __init__(
             data_label=data_label,
             variable_labels=variable_labels,
             convert_strl=convert_strl,
+            compression=compression,
         )
         # Override version set in StataWriter117 init
         self._dta_version = version
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1853,3 +1853,32 @@ def test_writer_118_exceptions(self):
         with tm.ensure_clean() as path:
             with pytest.raises(ValueError, match="You must use version 119"):
                 StataWriterUTF8(path, df, version=118)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+@pytest.mark.parametrize("file_type", ["", "zip", "gz"])
+def test_infer_compression(file_type, version):
+    """Check to_stata infers compression from the file suffix and round-trips."""
+    file_name = "dta_inferred_compression.dta"
+    if file_type:
+        file_name += f".{file_type}"
+    df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
+    df.index.name = "index"
+    with tm.ensure_clean(file_name) as path:
+        df.to_stata(path, version=version)
+        if file_type == "gz":
+            import gzip
+
+            with gzip.open(path, "rb") as comp:
+                reread = read_stata(comp, index_col="index")
+        elif file_type == "zip":
+            import zipfile
+
+            with zipfile.ZipFile(path, "r") as zf:
+                for name in zf.namelist():
+                    bio = io.BytesIO(zf.read(name))
+                    reread = read_stata(bio, index_col="index")
+        else:
+            # No compression
+            reread = read_stata(path, index_col="index")
+        tm.assert_frame_equal(df, reread)