PERF: use new to_records() argument in to_stata() (pandas-dev#25045)

qwhelan · Pingviinituutti · commit 77b864d82461 · 2019-02-28T10:26:56.000+02:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -23,14 +23,6 @@ Other Enhancements
 -
 -
 
-.. _whatsnew_0250.performance:
-
-Performance Improvements
-~~~~~~~~~~~~~~~~~~~~~~~~
- - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
-
-
-
 .. _whatsnew_0250.api_breaking:
 
 Backwards incompatible API changes
@@ -69,8 +61,8 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
--
--
+- Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing a performance regression introduced in v0.20.0 (:issue:`24985`)
+- :meth:`DataFrame.to_stata` is now faster when writing data that contains string columns or columns with non-native endianness (:issue:`25045`)
 -
 
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -2385,32 +2385,22 @@ def _prepare_data(self):
         data = self._convert_strls(data)
 
         # 3. Convert bad string data to '' and pad to correct length
-        dtypes = []
-        data_cols = []
-        has_strings = False
+        dtypes = {}
         native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
         for i, col in enumerate(data):
             typ = typlist[i]
             if typ <= self._max_string_length:
-                has_strings = True
                 data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,))
                 stype = 'S{type}'.format(type=typ)
-                dtypes.append(('c' + str(i), stype))
-                string = data[col].str.encode(self._encoding)
-                data_cols.append(string.values.astype(stype))
+                dtypes[col] = stype
+                data[col] = data[col].str.encode(self._encoding).astype(stype)
             else:
-                values = data[col].values
                 dtype = data[col].dtype
                 if not native_byteorder:
                     dtype = dtype.newbyteorder(self._byteorder)
-                dtypes.append(('c' + str(i), dtype))
-                data_cols.append(values)
-        dtypes = np.dtype(dtypes)
+                dtypes[col] = dtype
 
-        if has_strings or not native_byteorder:
-            self.data = np.fromiter(zip(*data_cols), dtype=dtypes)
-        else:
-            self.data = data.to_records(index=False)
+        self.data = data.to_records(index=False, column_dtypes=dtypes)
 
     def _write_data(self):
         data = self.data