Skip to content

BUG: byteorder option in to_stata is not honoured when writing strL data #58970

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ I/O
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_stata` where the ``byteorder`` option was not honoured when writing a :class:`DataFrame` containing strL data with ``byteorder="big"``. (:issue:`58969`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
17 changes: 14 additions & 3 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3037,6 +3037,8 @@ def __init__(
if byteorder is None:
byteorder = sys.byteorder
self._byteorder = _set_endianness(byteorder)
# Flag whether chosen byteorder matches the system on which we're running
self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder)

gso_v_type = "I" # uint32
gso_o_type = "Q" # uint64
Expand All @@ -3049,13 +3051,20 @@ def __init__(
o_size = 6
else: # version == 119
o_size = 5
self._o_offet = 2 ** (8 * (8 - o_size))
if self._native_byteorder:
self._o_offet = 2 ** (8 * (8 - o_size))
else:
self._o_offet = 2 ** (8 * o_size)
self._gso_o_type = gso_o_type
self._gso_v_type = gso_v_type

def _convert_key(self, key: tuple[int, int]) -> int:
v, o = key
return v + self._o_offet * o
if self._native_byteorder:
return v + self._o_offet * o
else:
# v, o will be swapped when applying byteorder
return o + self._o_offet * v

def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
"""
Expand Down Expand Up @@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame:
]

if convert_cols:
ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
ssw = StataStrLWriter(
data, convert_cols, version=self._dta_version, byteorder=self._byteorder
)
tab, new_data = ssw.generate_table()
data = new_data
self._strl_blob = ssw.generate_blob(tab)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1678,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
formatted = df.loc[0, column + "_fmt"]
assert unformatted == formatted

def test_writer_117(self, temp_file):
@pytest.mark.parametrize("byteorder", ["little", "big"])
def test_writer_117(self, byteorder, temp_file):
original = DataFrame(
data=[
[
Expand Down Expand Up @@ -1736,6 +1737,7 @@ def test_writer_117(self, temp_file):
original.to_stata(
path,
convert_dates={"datetime": "tc"},
byteorder=byteorder,
convert_strl=["forced_strl"],
version=117,
)
Expand Down Expand Up @@ -1940,7 +1942,8 @@ def test_stata_119(self, datapath):
assert reader._nvar == 32999

@pytest.mark.parametrize("version", [118, 119, None])
def test_utf8_writer(self, version, temp_file):
@pytest.mark.parametrize("byteorder", ["little", "big"])
def test_utf8_writer(self, version, byteorder, temp_file):
cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
data = DataFrame(
[
Expand Down Expand Up @@ -1968,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file):
convert_strl=["strls"],
variable_labels=variable_labels,
write_index=False,
byteorder=byteorder,
version=version,
value_labels=value_labels,
)
Expand Down