Skip to content

Commit 9cb969e

Browse files
authored
PERF: read_stata (#43277)
* PERF: read_stata
* fix ArrayManager
1 parent 303fc9a commit 9cb969e

File tree

1 file changed

+8
-10
lines changed

1 file changed

+8
-10
lines changed

pandas/io/stata.py

+8-10
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,6 @@
5353
DatetimeIndex,
5454
NaT,
5555
Timestamp,
56-
concat,
5756
isna,
5857
to_datetime,
5958
to_timedelta,
@@ -1663,7 +1662,7 @@ def read(
16631662
# restarting at 0 for each chunk.
16641663
if index_col is None:
16651664
ix = np.arange(self._lines_read - read_lines, self._lines_read)
1666-
data = data.set_index(ix)
1665+
data.index = ix # set attr instead of set_index to avoid copy
16671666

16681667
if columns is not None:
16691668
try:
@@ -1779,19 +1778,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
17791778
if dtype not in (np.float32, np.float64):
17801779
dtype = np.float64
17811780
replacement = Series(series, dtype=dtype)
1781+
if not replacement._values.flags["WRITEABLE"]:
1782+
# only relevant for ArrayManager; construction
1783+
# path for BlockManager ensures writeability
1784+
replacement = replacement.copy()
17821785
# Note: operating on ._values is much faster than directly
17831786
# TODO: can we fix that?
17841787
replacement._values[missing] = np.nan
17851788
replacements[colname] = replacement
1789+
17861790
if replacements:
1787-
columns = data.columns
1788-
replacement_df = DataFrame(replacements, copy=False)
1789-
replaced = concat(
1790-
[data.drop(replacement_df.columns, axis=1), replacement_df],
1791-
axis=1,
1792-
copy=False,
1793-
)
1794-
data = replaced[columns]
1791+
for col in replacements:
1792+
data[col] = replacements[col]
17951793
return data
17961794

17971795
def _insert_strls(self, data: DataFrame) -> DataFrame:

0 commit comments

Comments
 (0)