Skip to content

Commit 37a5fa5

Browse files
kinowproost
authored and committed
BUG: Set na_rep values before converting to string to prevent data truncation (pandas-dev#25103)
* Fix pandas-dev#25099 set na_rep values before converting to string to prevent data truncation
1 parent 8599617 commit 37a5fa5

File tree

4 files changed

+29
-6
lines changed

4 files changed

+29
-6
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ I/O
183183
- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
184184
- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`)
185185
- Improve infinity parsing. :meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`)
186+
- Bug in :meth:`DataFrame.to_csv` where ``na_rep`` was truncated when it was longer than the string representation of the column's data (:issue:`25099`)
186187

187188
Plotting
188189
^^^^^^^^

pandas/_libs/writers.pyx

+13-4
Original file line numberDiff line numberDiff line change
@@ -120,17 +120,26 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
120120

121121
for i in range(length):
122122
val = arr[i]
123-
if isinstance(val, str):
124-
l = PyUnicode_GET_SIZE(val)
125-
elif isinstance(val, bytes):
126-
l = PyBytes_GET_SIZE(val)
123+
l = word_len(val)
127124

128125
if l > m:
129126
m = l
130127

131128
return m
132129

133130

131+
cpdef inline Py_ssize_t word_len(object val):
132+
""" return the maximum length of a string or bytes value """
133+
cdef:
134+
Py_ssize_t l = 0
135+
136+
if isinstance(val, str):
137+
l = PyUnicode_GET_SIZE(val)
138+
elif isinstance(val, bytes):
139+
l = PyBytes_GET_SIZE(val)
140+
141+
return l
142+
134143
# ------------------------------------------------------------------
135144
# PyTables Helpers
136145

pandas/core/internals/blocks.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import numpy as np
99

10-
from pandas._libs import NaT, Timestamp, lib, tslib
10+
from pandas._libs import NaT, Timestamp, lib, tslib, writers
1111
import pandas._libs.internals as libinternals
1212
from pandas._libs.tslibs import Timedelta, conversion
1313
from pandas._libs.tslibs.timezones import tz_compare
@@ -706,7 +706,8 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
706706
mask = isna(values)
707707

708708
if not self.is_object and not quoting:
709-
values = values.astype(str)
709+
itemsize = writers.word_len(na_rep)
710+
values = values.astype("<U{size}".format(size=itemsize))
710711
else:
711712
values = np.array(values, dtype="object")
712713

pandas/tests/io/formats/test_to_csv.py

+12
Original file line numberDiff line numberDiff line change
@@ -555,3 +555,15 @@ def test_to_csv_zip_arguments(self, compression, archive_name):
555555
assert len(zp.filelist) == 1
556556
archived_file = os.path.basename(zp.filelist[0].filename)
557557
assert archived_file == expected_arcname
558+
559+
@pytest.mark.parametrize("df_new_type", ["Int64"])
560+
def test_to_csv_na_rep_long_string(self, df_new_type):
561+
# see gh-25099
562+
df = pd.DataFrame({"c": [float("nan")] * 3})
563+
df = df.astype(df_new_type)
564+
expected_rows = ["c", "mynull", "mynull", "mynull"]
565+
expected = tm.convert_rows_list_to_csv_str(expected_rows)
566+
567+
result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")
568+
569+
assert expected == result

0 commit comments

Comments
 (0)