Skip to content

Commit 629ffeb

Browse files
authored
BUG: byteorder option in to_stata is not honoured when writing strL data (#58970)
* BUG: byteorder option in to_stata is not honoured when writing strL data
* Check whether requested byteorder matches the current system once, and store the result
1 parent b290bf0 commit 629ffeb

File tree

3 files changed

+21
-5
lines changed

3 files changed

+21
-5
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ I/O
544544
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
545545
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
546546
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
547+
- Bug in :meth:`DataFrame.to_stata` when writing a :class:`DataFrame` with strL data and ``byteorder="big"``. (:issue:`58969`)
547548
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
548549
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
549550
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/io/stata.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -3037,6 +3037,8 @@ def __init__(
30373037
if byteorder is None:
30383038
byteorder = sys.byteorder
30393039
self._byteorder = _set_endianness(byteorder)
3040+
# Flag whether chosen byteorder matches the system on which we're running
3041+
self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
30403042

30413043
gso_v_type = "I" # uint32
30423044
gso_o_type = "Q" # uint64
@@ -3049,13 +3051,20 @@ def __init__(
30493051
o_size = 6
30503052
else: # version == 119
30513053
o_size = 5
3052-
self._o_offet = 2 ** (8 * (8 - o_size))
3054+
if self._native_byteorder:
3055+
self._o_offet = 2 ** (8 * (8 - o_size))
3056+
else:
3057+
self._o_offet = 2 ** (8 * o_size)
30533058
self._gso_o_type = gso_o_type
30543059
self._gso_v_type = gso_v_type
30553060

30563061
def _convert_key(self, key: tuple[int, int]) -> int:
30573062
v, o = key
3058-
return v + self._o_offet * o
3063+
if self._native_byteorder:
3064+
return v + self._o_offet * o
3065+
else:
3066+
# v, o will be swapped when applying byteorder
3067+
return o + self._o_offet * v
30593068

30603069
def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
30613070
"""
@@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame:
35323541
]
35333542

35343543
if convert_cols:
3535-
ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
3544+
ssw = StataStrLWriter(
3545+
data, convert_cols, version=self._dta_version, byteorder=self._byteorder
3546+
)
35363547
tab, new_data = ssw.generate_table()
35373548
data = new_data
35383549
self._strl_blob = ssw.generate_blob(tab)

pandas/tests/io/test_stata.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1678,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
16781678
formatted = df.loc[0, column + "_fmt"]
16791679
assert unformatted == formatted
16801680

1681-
def test_writer_117(self, temp_file):
1681+
@pytest.mark.parametrize("byteorder", ["little", "big"])
1682+
def test_writer_117(self, byteorder, temp_file):
16821683
original = DataFrame(
16831684
data=[
16841685
[
@@ -1736,6 +1737,7 @@ def test_writer_117(self, temp_file):
17361737
original.to_stata(
17371738
path,
17381739
convert_dates={"datetime": "tc"},
1740+
byteorder=byteorder,
17391741
convert_strl=["forced_strl"],
17401742
version=117,
17411743
)
@@ -1940,7 +1942,8 @@ def test_stata_119(self, datapath):
19401942
assert reader._nvar == 32999
19411943

19421944
@pytest.mark.parametrize("version", [118, 119, None])
1943-
def test_utf8_writer(self, version, temp_file):
1945+
@pytest.mark.parametrize("byteorder", ["little", "big"])
1946+
def test_utf8_writer(self, version, byteorder, temp_file):
19441947
cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
19451948
data = DataFrame(
19461949
[
@@ -1968,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file):
19681971
convert_strl=["strls"],
19691972
variable_labels=variable_labels,
19701973
write_index=False,
1974+
byteorder=byteorder,
19711975
version=version,
19721976
value_labels=value_labels,
19731977
)

0 commit comments

Comments
 (0)