BUG: Set na_rep values before converting to string to prevent data truncation (pandas-dev#25103)

kinow · proost · commit 37a5fa5af8c5 · 2019-12-20T01:21:38.000+09:00
* Fix pandas-dev#25099 set na_rep values before converting to string to prevent data truncation
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -183,6 +183,7 @@ I/O
 - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
 - Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`)
 - Improve infinity parsing. :meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`)
+- Bug in :meth:`DataFrame.to_csv` where ``na_rep`` was truncated when it was longer than the text representation of the values in the column. (:issue:`25099`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx
@@ -120,17 +120,26 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
 
     for i in range(length):
         val = arr[i]
-        if isinstance(val, str):
-            l = PyUnicode_GET_SIZE(val)
-        elif isinstance(val, bytes):
-            l = PyBytes_GET_SIZE(val)
+        l = word_len(val)
 
         if l > m:
             m = l
 
     return m
 
 
+cpdef inline Py_ssize_t word_len(object val):
+    """ return the length of a str or bytes value; 0 for any other type """
+    cdef:
+        Py_ssize_t l = 0
+
+    if isinstance(val, str):
+        l = PyUnicode_GET_SIZE(val)
+    elif isinstance(val, bytes):
+        l = PyBytes_GET_SIZE(val)
+
+    return l
+
 # ------------------------------------------------------------------
 # PyTables Helpers
 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from pandas._libs import NaT, Timestamp, lib, tslib
+from pandas._libs import NaT, Timestamp, lib, tslib, writers
 import pandas._libs.internals as libinternals
 from pandas._libs.tslibs import Timedelta, conversion
 from pandas._libs.tslibs.timezones import tz_compare
@@ -706,7 +706,12 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         mask = isna(values)
 
         if not self.is_object and not quoting:
-            values = values.astype(str)
+            itemsize = writers.word_len(na_rep)
+            values = values.astype(str)
+            # astype(str) picks a width that fits the data; widen it only when
+            # na_rep would not fit, never narrow it (that would truncate values).
+            if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
+                values = values.astype("<U{size}".format(size=itemsize))
         else:
             values = np.array(values, dtype="object")
 
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -555,3 +555,15 @@ def test_to_csv_zip_arguments(self, compression, archive_name):
             assert len(zp.filelist) == 1
             archived_file = os.path.basename(zp.filelist[0].filename)
             assert archived_file == expected_arcname
+
+    @pytest.mark.parametrize("df_new_type", ["Int64"])
+    def test_to_csv_na_rep_long_string(self, df_new_type):
+        # see gh-25099: na_rep must not be truncated to the width of the data
+        df = pd.DataFrame({"c": [float("nan")] * 3})
+        df = df.astype(df_new_type)
+        expected_rows = ["c", "mynull", "mynull", "mynull"]
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
+        result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")
+
+        assert expected == result