Skip to content

Commit e5f3c06

Browse files
committed
PERF: Improve performance of StataReader
Improve performance of StataReader when converting columns with missing values (xref #25772).
1 parent 27aa9d8 commit e5f3c06

File tree

4 files changed

+28
-11
lines changed

4 files changed

+28
-11
lines changed

asv_bench/benchmarks/io/stata.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@ class Stata(BaseIO):
1212

1313
def setup(self, convert_dates):
1414
self.fname = '__test__.dta'
15-
N = 100000
16-
C = 5
15+
N = self.N = 100000
16+
C = self.C = 5
1717
self.df = DataFrame(np.random.randn(N, C),
1818
columns=['float{}'.format(i) for i in range(C)],
1919
index=date_range('20000101', periods=N, freq='H'))
20-
self.df['object'] = tm.makeStringIndex(N)
20+
self.df['object'] = tm.makeStringIndex(self.N)
2121
self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
2222
np.iinfo(np.int8).max - 27, N)
2323
self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
@@ -36,4 +36,14 @@ def time_write_stata(self, convert_dates):
3636
self.df.to_stata(self.fname, self.convert_dates)
3737

3838

39+
class StataMissing(Stata):
40+
def setup(self, convert_dates):
41+
super(StataMissing, self).setup(convert_dates)
42+
for i in range(10):
43+
missing_data = np.random.randn(self.N)
44+
missing_data[missing_data < 0] = np.nan
45+
self.df['missing_{0}'.format(i)] = missing_data
46+
self.df.to_stata(self.fname, self.convert_dates)
47+
48+
3949
from ..pandas_vb_common import setup # noqa: F401

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ I/O
278278
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
279279
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
280280
- Bug in :func:`read_csv` not properly interpreting UTF8-encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
281-
-
281+
- Improved performance in :func:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
282282

283283

284284
Plotting

pandas/io/stata.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
from pandas.core.dtypes.common import (
3232
ensure_object, is_categorical_dtype, is_datetime64_dtype)
3333

34-
from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
34+
from pandas import (
35+
DatetimeIndex, compat, concat, isna, to_datetime, to_timedelta)
3536
from pandas.core.arrays import Categorical
3637
from pandas.core.base import StringMixin
3738
from pandas.core.frame import DataFrame
@@ -1572,7 +1573,7 @@ def read(self, nrows=None, convert_dates=None,
15721573
data = DataFrame.from_dict(OrderedDict(data_formatted))
15731574
del data_formatted
15741575

1575-
self._do_convert_missing(data, convert_missing)
1576+
data = self._do_convert_missing(data, convert_missing)
15761577

15771578
if convert_dates:
15781579
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
@@ -1616,7 +1617,7 @@ def read(self, nrows=None, convert_dates=None,
16161617

16171618
def _do_convert_missing(self, data, convert_missing):
16181619
# Check for missing values, and replace if found
1619-
1620+
replacements = {}
16201621
for i, colname in enumerate(data):
16211622
fmt = self.typlist[i]
16221623
if fmt not in self.VALID_RANGE:
@@ -1646,8 +1647,14 @@ def _do_convert_missing(self, data, convert_missing):
16461647
dtype = np.float64
16471648
replacement = Series(series, dtype=dtype)
16481649
replacement[missing] = np.nan
1649-
1650-
data[colname] = replacement
1650+
replacements[colname] = replacement
1651+
if replacements:
1652+
columns = data.columns
1653+
replacements = DataFrame(replacements)
1654+
data = concat([data.drop(replacements.columns, 1),
1655+
replacements], 1)
1656+
data = data[columns]
1657+
return data
16511658

16521659
def _insert_strls(self, data):
16531660
if not hasattr(self, 'GSO') or len(self.GSO) == 0:
@@ -1712,7 +1719,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
17121719
except ValueError:
17131720
vc = Series(categories).value_counts()
17141721
repeats = list(vc.index[vc > 1])
1715-
repeats = '\n' + '-' * 80 + '\n'.join(repeats)
1722+
repeats = '-' * 80 + '\n' + '\n'.join(repeats)
17161723
raise ValueError('Value labels for column {col} are not '
17171724
'unique. The repeated labels are:\n'
17181725
'{repeats}'

pandas/tests/io/test_stata.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1311,7 +1311,7 @@ def test_unsupported_datetype(self):
13111311
def test_repeated_column_labels(self):
13121312
# GH 13923
13131313
msg = (r"Value labels for column ethnicsn are not unique\. The"
1314-
r" repeated labels are:\n\n-+wolof")
1314+
r" repeated labels are:\n-+\nwolof")
13151315
with pytest.raises(ValueError, match=msg):
13161316
read_stata(self.dta23, convert_categoricals=True)
13171317

0 commit comments

Comments
 (0)