Skip to content

Commit 77b864d

Browse files
qwhelan authored and Pingviinituutti committed
PERF: use new to_records() argument in to_stata() (pandas-dev#25045)
1 parent e9e97c1 commit 77b864d

File tree

2 files changed

+7
-25
lines changed

2 files changed

+7
-25
lines changed

doc/source/whatsnew/v0.25.0.rst

+2 -10
Original file line number | Diff line number | Diff line change
@@ -23,14 +23,6 @@ Other Enhancements
2323
-
2424
-
2525

26-
.. _whatsnew_0250.performance:
27-
28-
Performance Improvements
29-
~~~~~~~~~~~~~~~~~~~~~~~~
30-
- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
31-
32-
33-
3426
.. _whatsnew_0250.api_breaking:
3527

3628
Backwards incompatible API changes
@@ -69,8 +61,8 @@ Removal of prior version deprecations/changes
6961
Performance Improvements
7062
~~~~~~~~~~~~~~~~~~~~~~~~
7163

72-
-
73-
-
64+
- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
65+
- `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`)
7466
-
7567

7668

pandas/io/stata.py

+5 -15
Original file line number | Diff line number | Diff line change
@@ -2385,32 +2385,22 @@ def _prepare_data(self):
23852385
data = self._convert_strls(data)
23862386

23872387
# 3. Convert bad string data to '' and pad to correct length
2388-
dtypes = []
2389-
data_cols = []
2390-
has_strings = False
2388+
dtypes = {}
23912389
native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
23922390
for i, col in enumerate(data):
23932391
typ = typlist[i]
23942392
if typ <= self._max_string_length:
2395-
has_strings = True
23962393
data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,))
23972394
stype = 'S{type}'.format(type=typ)
2398-
dtypes.append(('c' + str(i), stype))
2399-
string = data[col].str.encode(self._encoding)
2400-
data_cols.append(string.values.astype(stype))
2395+
dtypes[col] = stype
2396+
data[col] = data[col].str.encode(self._encoding).astype(stype)
24012397
else:
2402-
values = data[col].values
24032398
dtype = data[col].dtype
24042399
if not native_byteorder:
24052400
dtype = dtype.newbyteorder(self._byteorder)
2406-
dtypes.append(('c' + str(i), dtype))
2407-
data_cols.append(values)
2408-
dtypes = np.dtype(dtypes)
2401+
dtypes[col] = dtype
24092402

2410-
if has_strings or not native_byteorder:
2411-
self.data = np.fromiter(zip(*data_cols), dtype=dtypes)
2412-
else:
2413-
self.data = data.to_records(index=False)
2403+
self.data = data.to_records(index=False, column_dtypes=dtypes)
24142404

24152405
def _write_data(self):
24162406
data = self.data

0 commit comments

Comments
 (0)