diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e621ab2a5b9c5..07f5b01709223 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,6 +544,7 @@ I/O - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder='big'``. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 9c6cd2faeaa2f..d1e57ad568ba5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3037,6 +3037,8 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) + # Flag whether chosen byteorder matches the system on which we're running + self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder) gso_v_type = "I" # uint32 gso_o_type = "Q" # uint64 @@ -3049,13 +3051,20 @@ def __init__( o_size = 6 else: # version == 119 o_size = 5 - self._o_offet = 2 ** (8 * (8 - o_size)) + if self._native_byteorder: + self._o_offet = 2 ** (8 * (8 - o_size)) + else: + self._o_offet = 2 ** (8 * o_size) self._gso_o_type = gso_o_type self._gso_v_type = gso_v_type def _convert_key(self, key: tuple[int, int]) -> int: v, o = key - return v + self._o_offet * o + if self._native_byteorder: + return v + self._o_offet * o + else: + # v, o will be swapped when applying byteorder + return o + self._o_offet * v def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: """ @@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + ssw = StataStrLWriter( + data, convert_cols, version=self._dta_version, byteorder=self._byteorder + ) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index bf17e62985fe9..2534df6a82f89 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1678,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - def test_writer_117(self, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_writer_117(self, byteorder, temp_file): original = 
DataFrame( data=[ [ @@ -1736,6 +1737,7 @@ def test_writer_117(self, temp_file): original.to_stata( path, convert_dates={"datetime": "tc"}, + byteorder=byteorder, convert_strl=["forced_strl"], version=117, ) @@ -1940,7 +1942,8 @@ def test_stata_119(self, datapath): assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) - def test_utf8_writer(self, version, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_utf8_writer(self, version, byteorder, temp_file): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = DataFrame( [ @@ -1968,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file): convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + byteorder=byteorder, version=version, value_labels=value_labels, )