ENH: Add Stata 119 writer

bashtage · bashtage · commit e1c706ca766a · 2020-01-13T11:11:38.000Z
Add support for writing Stata 119 format files
Rename new writer to StataWriterUTF8 since no longer version specific
Improve exception message for unsupported files
Fix small issues in to_stata missed in 118
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -224,7 +224,7 @@ Other enhancements
 - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
 - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`)
 - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
-- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``.  This format supports exporting strings containing Unicode characters (:issue:`23573`)
+- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``.  These formats support exporting strings containing Unicode characters (:issue:`23573`)
 - :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
 - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
 - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1898,12 +1898,16 @@ def to_stata(
         variable_labels : dict
             Dictionary containing columns as keys and variable labels as
             values. Each label must be 80 characters or smaller.
-        version : {114, 117}, default 114
-            Version to use in the output dta file.  Version 114 can be used
-            read by Stata 10 and later.  Version 117 can be read by Stata 13
-            or later. Version 114 limits string variables to 244 characters or
-            fewer while 117 allows strings with lengths up to 2,000,000
-            characters.
+        version : {114, 117, 118, 119, None}, default 114
+            Version to use in the output dta file. Set to None to let pandas
+            decide between 118 or 119 formats depending on the number of
+            columns in the frame. Version 114 can be read by Stata 10 and
+            later. Version 117 can be read by Stata 13 or later. Version 118
+            is supported in Stata 14 and later. Version 119 is supported in
+            Stata 15 and later. Version 114 limits string variables to 244
+            characters or fewer while 117 allows strings with lengths up to
+            2,000,000 characters. Versions 118 and 119 support Unicode
+            characters, and version 119 supports more than 32,767 variables.
 
             .. versionadded:: 0.23.0
 
@@ -1940,7 +1944,7 @@ def to_stata(
         >>> df.to_stata('animals.dta')  # doctest: +SKIP
         """
         kwargs = {}
-        if version not in (114, 117, 118):
-            raise ValueError("Only formats 114, 117 and 118 are supported.")
+        if version not in (114, 117, 118, 119, None):
+            raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
         if version == 114:
             if convert_strl is not None:
@@ -1949,8 +1953,10 @@ def to_stata(
         else:
             if version == 117:
                 from pandas.io.stata import StataWriter117 as statawriter
-            else:
-                from pandas.io.stata import StataWriter118 as statawriter
+            else:  # versions 118, 119 and None (writer auto-selects)
+                from pandas.io.stata import StataWriterUTF8 as statawriter
+
+                kwargs["version"] = version
 
             kwargs["convert_strl"] = convert_strl
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -47,9 +47,10 @@
 from pandas.io.common import get_filepath_or_buffer, stringify_path
 
 _version_error = (
-    "Version of given Stata file is not 104, 105, 108, "
-    "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
-    "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
+    "Version of given Stata file is {version}. pandas supports importing "
+    "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
+    "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), "
+    "and 119 (Stata 15/16, over 32,767 variables)."
 )
 
 _statafile_processing_params1 = """\
@@ -1091,11 +1092,11 @@ def _read_header(self):
         self.col_sizes = [self._calcsize(typ) for typ in self.typlist]
 
     def _read_new_header(self, first_char):
-        # The first part of the header is common to 117 and 118.
+        # The first part of the header is common to 117 - 119.
         self.path_or_buf.read(27)  # stata_dta><header><release>
         self.format_version = int(self.path_or_buf.read(3))
         if self.format_version not in [117, 118, 119]:
-            raise ValueError(_version_error)
+            raise ValueError(_version_error.format(version=self.format_version))
         self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
         self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
@@ -1288,7 +1289,7 @@ def _get_seek_variable_labels(self):
     def _read_old_header(self, first_char):
         self.format_version = struct.unpack("b", first_char)[0]
         if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
-            raise ValueError(_version_error)
+            raise ValueError(_version_error.format(version=self.format_version))
         self._set_encoding()
         self.byteorder = (
             struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<"
@@ -2884,7 +2885,6 @@ class StataWriter117(StataWriter):
     """
 
     _max_string_length = 2045
-    _dta_version = 117
 
     def __init__(
         self,
@@ -2900,6 +2900,7 @@ def __init__(
     ):
         # Shallow copy since convert_strl might be modified later
         self._convert_strl = [] if convert_strl is None else convert_strl[:]
+        self._dta_version = 117
 
         super().__init__(
             fname,
@@ -2934,9 +2935,14 @@ def _write_header(self, data_label=None, time_stamp=None):
         bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
         # byteorder
         bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
-        # number of vars, 2 bytes
-        assert self.nvar < 2 ** 16
-        bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K"))
+        if self._dta_version < 119 and self.nvar > 32767:
+            raise RuntimeError(
+                "You must use version 119 for data sets containing more than "
+                "32,767 variables"
+            )
+        # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
+        nvar_type = "H" if self._dta_version <= 118 else "I"
+        bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
         # 117 uses 4 bytes, 118 uses 8
         nobs_size = "I" if self._dta_version == 117 else "Q"
         bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
@@ -3033,7 +3039,8 @@ def _write_varnames(self):
 
     def _write_sortlist(self):
         self._update_map("sortlist")
-        self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist"))
+        sort_size = 2 if self._dta_version < 119 else 4  # bytes per sortlist entry
+        self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
 
     def _write_formats(self):
         self._update_map("formats")
@@ -3173,13 +3180,14 @@ def _set_formats_and_types(self, dtypes):
             )
 
 
-class StataWriter118(StataWriter117):
+class StataWriterUTF8(StataWriter117):
     """
-    A class for writing Stata binary dta files in Stata 15 format (118)
+    Stata binary dta file writing in 118 (Stata 14+) and 119 (Stata 15+) formats
 
-    DTA 118 format files support unicode string data (both fixed and strL)
-    format. Unicode is also supported in value labels, variable labels and
-    the dataset label.
+    DTA 118 and 119 format files support unicode string data (both fixed
+    and strL) format. Unicode is also supported in value labels, variable
+    labels and the dataset label. Format 119 is automatically used if the
+    file contains more than 32,767 variables.
 
     .. versionadded:: 1.0.0
 
@@ -3216,10 +3224,14 @@ class StataWriter118(StataWriter117):
         Smaller columns can be converted by including the column name.  Using
         StrLs can reduce output file size when strings are longer than 8
         characters, and either frequently repeated or sparse.
+    version : int, optional
+        The dta version to use. By default, uses the size of data to determine
+        the version. 118 is used if data.shape[1] <= 32767, and 119 is used
+        for storing larger DataFrames.
 
     Returns
     -------
-    StataWriter118
+    StataWriterUTF8
         The instance has a write_file method, which will write the file to the
         given `fname`.
 
@@ -3238,22 +3250,52 @@ class StataWriter118(StataWriter117):
     --------
     Using Unicode data and column names
 
-    >>> from pandas.io.stata import StataWriter118
+    >>> from pandas.io.stata import StataWriterUTF8
     >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
-    >>> writer = StataWriter118('./data_file.dta', data)
+    >>> writer = StataWriterUTF8('./data_file.dta', data)
     >>> writer.write_file()
 
     Or with long strings stored in strl format
 
     >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
     ...                     columns=['strls'])
-    >>> writer = StataWriter118('./data_file_with_long_strings.dta', data,
-    ...                         convert_strl=['strls'])
+    >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
+    ...                          convert_strl=['strls'])
     >>> writer.write_file()
     """
 
     _encoding = "utf-8"
-    _dta_version = 118
+
+    def __init__(
+        self,
+        fname,
+        data,
+        convert_dates=None,
+        write_index=True,
+        byteorder=None,
+        time_stamp=None,
+        data_label=None,
+        variable_labels=None,
+        convert_strl=None,
+        version=None,
+    ):
+        if version is None:  # auto-pick by column count; index excluded? (verify)
+            version = 118 if data.shape[1] <= 32767 else 119
+        elif version not in (118, 119):
+            raise ValueError("version must be either 118 or 119.")
+        super().__init__(
+            fname,
+            data,
+            convert_dates=convert_dates,
+            write_index=write_index,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            variable_labels=variable_labels,
+            convert_strl=convert_strl,
+        )
+        # StataWriter117.__init__ assigns 117, so override it after super() returns
+        self._dta_version = version
 
     def _validate_variable_name(self, name):
         """
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -21,7 +21,7 @@
     PossiblePrecisionLoss,
     StataMissingValue,
     StataReader,
-    StataWriter118,
+    StataWriterUTF8,
     read_stata,
 )
 
@@ -1770,7 +1770,8 @@ def test_stata_119(self):
         assert df.iloc[0, -1] == 1
         assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))
 
-    def test_118_writer(self):
+    @pytest.mark.parametrize("version", [118, 119, None])
+    def test_utf8_writer(self, version):
         cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
         data = pd.DataFrame(
             [
@@ -1791,13 +1792,14 @@ def test_118_writer(self):
         data_label = "ᴅaᵀa-label"
         data["β"] = data["β"].astype(np.int32)
         with tm.ensure_clean() as path:
-            writer = StataWriter118(
+            writer = StataWriterUTF8(
                 path,
                 data,
                 data_label=data_label,
                 convert_strl=["strls"],
                 variable_labels=variable_labels,
                 write_index=False,
+                version=version,
             )
             writer.write_file()
             reread_encoded = read_stata(path)
@@ -1807,3 +1809,7 @@ def test_118_writer(self):
             reader = StataReader(path)
             assert reader.data_label == data_label
             assert reader.variable_labels() == variable_labels
+
+            data.to_stata(path, version=version, write_index=False)
+            reread_to_stata = read_stata(path)
+            tm.assert_frame_equal(data, reread_to_stata)