diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0879189a822f8..c423933d4c438 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -223,7 +223,7 @@ Other enhancements - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) -- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`) +- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These file formats support exporting strings containing Unicode characters. Format 119 supports data sets with more than 32,767 variables (:issue:`23573`, :issue:`30959`) - :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`) - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 594b8a00a8672..42dc21156ba59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1898,14 +1898,22 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {114, 117}, default 114 - Version to use in the output dta file. Version 114 can be used - read by Stata 10 and later. Version 117 can be read by Stata 13 - or later. 
Version 114 limits string variables to 244 characters or - fewer while 117 allows strings with lengths up to 2,000,000 - characters. + version : {114, 117, 118, 119, None}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Added support for formats 118 and 119. convert_strl : list, optional List of column names to convert to string columns to Stata StrL @@ -1939,20 +1947,24 @@ def to_stata( ... 
'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ - kwargs = {} - if version not in (114, 117, 118): - raise ValueError("Only formats 114, 117 and 118 are supported.") + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") if version == 114: if convert_strl is not None: raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter - else: - if version == 117: - from pandas.io.stata import StataWriter117 as statawriter - else: - from pandas.io.stata import StataWriter118 as statawriter + elif version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: # versions 118 and 119 + from pandas.io.stata import StataWriterUTF8 as statawriter + kwargs = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version writer = statawriter( path, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2c1222aad12cc..b8e04ad55dde1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -15,7 +15,7 @@ import os import struct import sys -from typing import Any +from typing import Any, Dict, Hashable, Optional, Sequence import warnings from dateutil.relativedelta import relativedelta @@ -23,6 +23,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array +from pandas._typing import FilePathOrBuffer from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -47,9 +48,10 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path _version_error = ( - "Version of given Stata file is not 104, 105, 108, " - "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " - "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" + "Version of given 
Stata file is {version}. pandas supports importing " + "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), " + "and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -1090,11 +1092,11 @@ def _read_header(self): self.col_sizes = [self._calcsize(typ) for typ in self.typlist] def _read_new_header(self, first_char): - # The first part of the header is common to 117 and 118. + # The first part of the header is common to 117 - 119. self.path_or_buf.read(27) # stata_dta>
self.format_version = int(self.path_or_buf.read(3)) if self.format_version not in [117, 118, 119]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.path_or_buf.read(21) # self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" @@ -1287,7 +1289,7 @@ def _get_seek_variable_labels(self): def _read_old_header(self, first_char): self.format_version = struct.unpack("b", first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.byteorder = ( struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" @@ -2695,7 +2697,7 @@ def _convert_key(self, key): def generate_table(self): """ - Generates the GSO lookup table for the DataFRame + Generates the GSO lookup table for the DataFrame Returns ------- @@ -2934,9 +2936,9 @@ def _write_header(self, data_label=None, time_stamp=None): bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) - # number of vars, 2 bytes - assert self.nvar < 2 ** 16 - bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) + # number of vars, 2 bytes in 117 and 118, 4 bytes in 119 + nvar_type = "H" if self._dta_version <= 118 else "I" + bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) # 117 uses 4 bytes, 118 uses 8 nobs_size = "I" if self._dta_version == 117 else "Q" bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) @@ -3033,7 +3035,8 @@ def _write_varnames(self): def _write_sortlist(self): self._update_map("sortlist") - self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist")) + sort_size = 2 if self._dta_version < 119 else 4 + self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) def _write_formats(self): 
self._update_map("formats") @@ -3173,13 +3176,14 @@ def _set_formats_and_types(self, dtypes): ) -class StataWriter118(StataWriter117): +class StataWriterUTF8(StataWriter117): """ - A class for writing Stata binary dta files in Stata 15 format (118) + Stata binary dta file writing in Stata 14 (118) and 15 (119) formats - DTA 118 format files support unicode string data (both fixed and strL) - format. Unicode is also supported in value labels, variable labels and - the dataset label. + DTA 118 and 119 format files support unicode string data (both fixed + and strL) format. Unicode is also supported in value labels, variable + labels and the dataset label. Format 119 is automatically used if the + file contains more than 32,767 variables. .. versionadded:: 1.0.0 @@ -3192,34 +3196,38 @@ class StataWriter118(StataWriter117): is written. data : DataFrame Input to save - convert_dates : dict + convert_dates : dict, default None Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. Datetime columns that do not have a conversion type specified will be converted to 'tc'. Raises NotImplementedError if a datetime column has timezone information - write_index : bool + write_index : bool, default True Write the index to Stata dataset. - byteorder : str + byteorder : str, default None Can be ">", "<", "little", or "big". default is `sys.byteorder` - time_stamp : datetime + time_stamp : datetime, default None A datetime to use as file creation date. Default is the current time - data_label : str + data_label : str, default None A label for the data set. Must be 80 characters or smaller. - variable_labels : dict + variable_labels : dict, default None Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. 
- convert_strl : list + convert_strl : list, default None List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. Smaller columns can be converted by including the column name. Using StrLs can reduce output file size when strings are longer than 8 characters, and either frequently repeated or sparse. + version : int, default None + The dta version to use. By default, uses the size of data to determine + the version. 118 is used if data.shape[1] <= 32767, and 119 is used + for storing larger DataFrames. Returns ------- - StataWriter118 + StataWriterUTF8 The instance has a write_file method, which will write the file to the given `fname`. @@ -3238,24 +3246,60 @@ class StataWriter118(StataWriter117): -------- Using Unicode data and column names - >>> from pandas.io.stata import StataWriter118 + >>> from pandas.io.stata import StataWriterUTF8 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) - >>> writer = StataWriter118('./data_file.dta', data) + >>> writer = StataWriterUTF8('./data_file.dta', data) >>> writer.write_file() Or with long strings stored in strl format >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], ... columns=['strls']) - >>> writer = StataWriter118('./data_file_with_long_strings.dta', data, - ... convert_strl=['strls']) + >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, + ... 
convert_strl=['strls']) >>> writer.write_file() """ _encoding = "utf-8" - _dta_version = 118 - def _validate_variable_name(self, name): + def __init__( + self, + fname: FilePathOrBuffer, + data: DataFrame, + convert_dates: Optional[Dict[Hashable, str]] = None, + write_index: bool = True, + byteorder: Optional[str] = None, + time_stamp: Optional[datetime.datetime] = None, + data_label: Optional[str] = None, + variable_labels: Optional[Dict[Hashable, str]] = None, + convert_strl: Optional[Sequence[Hashable]] = None, + version: Optional[int] = None, + ): + if version is None: + version = 118 if data.shape[1] <= 32767 else 119 + elif version not in (118, 119): + raise ValueError("version must be either 118 or 119.") + elif version == 118 and data.shape[1] > 32767: + raise ValueError( + "You must use version 119 for data sets containing more than " + "32,767 variables" + ) + + super().__init__( + fname, + data, + convert_dates=convert_dates, + write_index=write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + convert_strl=convert_strl, + ) + # Override version set in StataWriter117 init + self._dta_version = version + + def _validate_variable_name(self, name: str) -> str: """ Validate variable names for Stata export. @@ -3272,7 +3316,7 @@ def _validate_variable_name(self, name): Notes ----- - Stata 118 support most unicode characters. The only limatation is in + Stata 118+ support most unicode characters. The only limitation is in the ascii range where the characters supported are a-z, A-Z, 0-9 and _. 
""" # High code points appear to be acceptable diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 1d3cddbf01738..8e459f0cf8298 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,11 +21,22 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, - StataWriter118, + StataWriterUTF8, read_stata, ) +@pytest.fixture() +def mixed_frame(): + return pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [1.0, 3.0, 27.0, 81.0], + "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], + } + ) + + @pytest.fixture def dirpath(datapath): return datapath("io", "data", "stata") @@ -112,7 +123,7 @@ def read_dta(self, file): def read_csv(self, file): return read_csv(file, parse_dates=True) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_empty_dta(self, version): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file @@ -332,7 +343,7 @@ def test_write_dta6(self): check_index_type=False, ) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], @@ -368,7 +379,7 @@ def test_write_preserves_original(self): df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_encoding(self, version): # GH 4626, proper encoding handling @@ -409,7 +420,7 @@ def test_read_write_dta11(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta12(self, version): original = DataFrame( [(1, 2, 3, 4, 5, 6)], @@ 
-461,7 +472,7 @@ def test_read_write_dta13(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize( "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] ) @@ -504,7 +515,7 @@ def test_read_write_reread_dta15(self, file): tm.assert_frame_equal(expected, parsed) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_timestamp_and_label(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) @@ -518,7 +529,7 @@ def test_timestamp_and_label(self, version): assert reader.time_stamp == "29 Feb 2000 14:21" assert reader.data_label == data_label - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_timestamp(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = "01 Jan 2000, 00:00:00" @@ -542,7 +553,7 @@ def test_numeric_column_names(self): written_and_read_again.columns = map(convert_col_name, columns) tm.assert_frame_equal(original, written_and_read_again) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nan_to_missing_value(self, version): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), dtype=np.float64) @@ -662,7 +673,7 @@ def test_write_missing_strings(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) 
@@ -908,7 +919,7 @@ def test_drop_column(self): columns = ["byte_", "int_", "long_", "not_found"] read_stata(self.dta15_117, convert_dates=True, columns=columns) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.filterwarnings( "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch" ) @@ -985,7 +996,7 @@ def test_categorical_warnings_and_errors(self): original.to_stata(path) # should get a warning for mixed content - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_categorical_with_stata_missing_values(self, version): values = [["a" + str(i)] for i in range(120)] values.append([np.nan]) @@ -1221,20 +1232,13 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize - @pytest.mark.parametrize("version", [114, 117]) - def test_write_variable_labels(self, version): + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_write_variable_labels(self, version, mixed_frame): # GH 13631, add support for writing variable labels - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + mixed_frame.index.name = "index" variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() expected_labels = { @@ -1247,46 +1251,36 @@ def test_write_variable_labels(self, version): variable_labels["index"] = "The Index" with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, 
variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() assert read_labels == variable_labels - @pytest.mark.parametrize("version", [114, 117]) - def test_invalid_variable_labels(self, version): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_invalid_variable_labels(self, version, mixed_frame): + mixed_frame.index.name = "index" variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): - original.to_stata( + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) + @pytest.mark.parametrize("version", [114, 117]) + def test_invalid_variable_label_encoding(self, version, mixed_frame): + mixed_frame.index.name = "index" + variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: with pytest.raises( ValueError, match="Variable labels must contain only characters" ): - original.to_stata( + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) - def test_write_variable_label_errors(self): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) + def test_write_variable_label_errors(self, mixed_frame): values = ["\u03A1", "\u0391", "\u039D", "\u0394", "\u0391", "\u03A3"] variable_labels_utf8 = { @@ -1301,7 +1295,7 @@ def test_write_variable_label_errors(self): ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_utf8) + mixed_frame.to_stata(path, 
variable_labels=variable_labels_utf8) variable_labels_long = { "a": "City Rank", @@ -1314,7 +1308,7 @@ def test_write_variable_label_errors(self): msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_long) + mixed_frame.to_stata(path, variable_labels=variable_labels_long) def test_default_date_conversion(self): # GH 12259 @@ -1636,7 +1630,7 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 bio = io.BytesIO() @@ -1699,7 +1693,7 @@ def test_mixed_string_strl(self): expected = output.fillna("") tm.assert_frame_equal(reread, expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_all_none_exception(self, version): output = [{"none": "none", "number": 0}, {"none": None, "number": 1}] output = pd.DataFrame(output) @@ -1708,7 +1702,7 @@ def test_all_none_exception(self, version): with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_file_not_written(self, version): content = "Here is one __�__ Another one __·__ Another one __½__" df = DataFrame([content], columns=["invalid"]) @@ -1770,7 +1764,8 @@ def test_stata_119(self): assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) - def test_118_writer(self): + @pytest.mark.parametrize("version", [118, 119, None]) + def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = pd.DataFrame( 
[ @@ -1791,13 +1786,14 @@ def test_118_writer(self): data_label = "ᴅaᵀa-label" data["β"] = data["β"].astype(np.int32) with tm.ensure_clean() as path: - writer = StataWriter118( + writer = StataWriterUTF8( path, data, data_label=data_label, convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + version=version, ) writer.write_file() reread_encoded = read_stata(path) @@ -1807,3 +1803,16 @@ def test_118_writer(self): reader = StataReader(path) assert reader.data_label == data_label assert reader.variable_labels() == variable_labels + + data.to_stata(path, version=version, write_index=False) + reread_to_stata = read_stata(path) + tm.assert_frame_equal(data, reread_to_stata) + + def test_writer_118_exceptions(self): + df = DataFrame(np.zeros((1, 33000), dtype=np.int8)) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="version must be either 118 or 119."): + StataWriterUTF8(path, df, version=117) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="You must use version 119"): + StataWriterUTF8(path, df, version=118)