Skip to content

Commit cc999d9

Browse files
committed
PERF: Improve performance of StataReader
Improve performance of StataReader when converting columns with missing values xref #25772
1 parent c8ba766 commit cc999d9

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ I/O
235235
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
236236
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
237237
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
238-
-
238+
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
239239

240240

241241
Plotting

pandas/io/stata.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
from pandas.core.dtypes.common import (
3232
ensure_object, is_categorical_dtype, is_datetime64_dtype)
3333

34-
from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
34+
from pandas import (DatetimeIndex, compat, isna, to_datetime, to_timedelta,
35+
concat)
3536
from pandas.core.arrays import Categorical
3637
from pandas.core.base import StringMixin
3738
from pandas.core.frame import DataFrame
@@ -1572,7 +1573,7 @@ def read(self, nrows=None, convert_dates=None,
15721573
data = DataFrame.from_dict(OrderedDict(data_formatted))
15731574
del data_formatted
15741575

1575-
self._do_convert_missing(data, convert_missing)
1576+
data = self._do_convert_missing(data, convert_missing)
15761577

15771578
if convert_dates:
15781579
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
@@ -1616,7 +1617,7 @@ def read(self, nrows=None, convert_dates=None,
16161617

16171618
def _do_convert_missing(self, data, convert_missing):
16181619
# Check for missing values, and replace if found
1619-
1620+
replacements = {}
16201621
for i, colname in enumerate(data):
16211622
fmt = self.typlist[i]
16221623
if fmt not in self.VALID_RANGE:
@@ -1646,8 +1647,14 @@ def _do_convert_missing(self, data, convert_missing):
16461647
dtype = np.float64
16471648
replacement = Series(series, dtype=dtype)
16481649
replacement[missing] = np.nan
1649-
1650-
data[colname] = replacement
1650+
replacements[colname] = replacement
1651+
if replacements:
1652+
columns = data.columns
1653+
replacements = DataFrame(replacements)
1654+
data.drop(replacements.columns, 1, inplace=True)
1655+
data = concat([data, replacements], 1)
1656+
data = data[columns]
1657+
return data
16511658

16521659
def _insert_strls(self, data):
16531660
if not hasattr(self, 'GSO') or len(self.GSO) == 0:
@@ -1712,7 +1719,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
17121719
except ValueError:
17131720
vc = Series(categories).value_counts()
17141721
repeats = list(vc.index[vc > 1])
1715-
repeats = '\n' + '-' * 80 + '\n'.join(repeats)
1722+
repeats = '\n' + '-' * 80 + '\n' + '\n'.join(repeats)
17161723
raise ValueError('Value labels for column {col} are not '
17171724
'unique. The repeated labels are:\n'
17181725
'{repeats}'

0 commit comments

Comments
 (0)