Skip to content

Commit ba7232c

Browse files
committed
PERF: Improve performance of StataReader
Improve performance of StataReader when converting columns with missing values xref #25772
1 parent c8ba766 commit ba7232c

File tree

3 files changed

+27
-10
lines changed

3 files changed

+27
-10
lines changed

asv_bench/benchmarks/io/stata.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@ class Stata(BaseIO):
1212

1313
def setup(self, convert_dates):
1414
self.fname = '__test__.dta'
15-
N = 100000
16-
C = 5
15+
N = self.N = 100000
16+
C = self.C = 5
1717
self.df = DataFrame(np.random.randn(N, C),
1818
columns=['float{}'.format(i) for i in range(C)],
1919
index=date_range('20000101', periods=N, freq='H'))
20-
self.df['object'] = tm.makeStringIndex(N)
20+
self.df['object'] = tm.makeStringIndex(self.N)
2121
self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
2222
np.iinfo(np.int8).max - 27, N)
2323
self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
@@ -36,4 +36,14 @@ def time_write_stata(self, convert_dates):
3636
self.df.to_stata(self.fname, self.convert_dates)
3737

3838

39+
class StataMissing(Stata):
    """Stata benchmark variant whose frame carries columns with NaN values.

    Subclasses ``Stata`` and extends its ``setup``: after the parent has
    built ``self.df``, 100 extra float columns are appended in which every
    negative draw is replaced by ``np.nan``, and the frame is written to
    ``self.fname``.
    """

    def setup(self, convert_dates):
        """Build the base frame, add the missing-value columns, write it.

        Parameters
        ----------
        convert_dates
            Forwarded unchanged to ``Stata.setup``.
        """
        super(StataMissing, self).setup(convert_dates)
        for i in range(100):
            # One 1-D vector of length N per new column; a single column
            # label cannot be assigned an (N, C) block.
            missing_data = np.random.randn(self.N)
            missing_data[missing_data < 0] = np.nan
            # The '{}' placeholder is what makes each label unique; without
            # it every iteration targets the same 'missing_' column.
            self.df['missing_{}'.format(i)] = missing_data
        self.df.to_stata(self.fname, self.convert_dates)
47+
48+
3949
from ..pandas_vb_common import setup # noqa: F401

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ I/O
235235
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
236236
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
237237
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
238-
-
238+
- Improved performance in :func:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
239239

240240

241241
Plotting

pandas/io/stata.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
from pandas.core.dtypes.common import (
3232
ensure_object, is_categorical_dtype, is_datetime64_dtype)
3333

34-
from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
34+
from pandas import (DatetimeIndex, compat, isna, to_datetime, to_timedelta,
35+
concat)
3536
from pandas.core.arrays import Categorical
3637
from pandas.core.base import StringMixin
3738
from pandas.core.frame import DataFrame
@@ -1572,7 +1573,7 @@ def read(self, nrows=None, convert_dates=None,
15721573
data = DataFrame.from_dict(OrderedDict(data_formatted))
15731574
del data_formatted
15741575

1575-
self._do_convert_missing(data, convert_missing)
1576+
data = self._do_convert_missing(data, convert_missing)
15761577

15771578
if convert_dates:
15781579
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
@@ -1616,7 +1617,7 @@ def read(self, nrows=None, convert_dates=None,
16161617

16171618
def _do_convert_missing(self, data, convert_missing):
16181619
# Check for missing values, and replace if found
1619-
1620+
replacements = {}
16201621
for i, colname in enumerate(data):
16211622
fmt = self.typlist[i]
16221623
if fmt not in self.VALID_RANGE:
@@ -1646,8 +1647,14 @@ def _do_convert_missing(self, data, convert_missing):
16461647
dtype = np.float64
16471648
replacement = Series(series, dtype=dtype)
16481649
replacement[missing] = np.nan
1649-
1650-
data[colname] = replacement
1650+
replacements[colname] = replacement
1651+
if replacements:
1652+
columns = data.columns
1653+
replacements = DataFrame(replacements)
1654+
data = concat([data.drop(replacements.columns, 1),
1655+
replacements], 1)
1656+
data = data[columns]
1657+
return data
16511658

16521659
def _insert_strls(self, data):
16531660
if not hasattr(self, 'GSO') or len(self.GSO) == 0:
@@ -1712,7 +1719,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
17121719
except ValueError:
17131720
vc = Series(categories).value_counts()
17141721
repeats = list(vc.index[vc > 1])
1715-
repeats = '\n' + '-' * 80 + '\n'.join(repeats)
1722+
repeats = '\n' + '-' * 80 + '\n' + '\n'.join(repeats)
17161723
raise ValueError('Value labels for column {col} are not '
17171724
'unique. The repeated labels are:\n'
17181725
'{repeats}'

0 commit comments

Comments
 (0)