
Commit 489c394

Merge pull request #8073 from bashtage/stata-read-batching
PERF: StataReader is slow
2 parents: 2594bea + 14b28a1

6 files changed (+232, -85 lines)

doc/source/v0.15.0.txt (+1 -1)

@@ -481,7 +481,7 @@ Performance
 - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`)
 - Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`)
 - Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`)
-- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`)
+- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`)
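The entry above refers to ``pandas.read_stata``. A minimal usage sketch with the default keyword arguments, which exercise the code paths reworked below (the file name is hypothetical):

import pandas as pd

# convert_dates=True runs the date conversion that the vectorized helper below
# speeds up; convert_categoricals=True applies Stata value labels.
df = pd.read_stata("example.dta", convert_dates=True,
                   convert_categoricals=True)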

pandas/io/stata.py (+184 -79)
@@ -13,18 +13,18 @@
 
 import sys
 import struct
+from dateutil.relativedelta import relativedelta
 from pandas.core.base import StringMixin
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
 from pandas.core.categorical import Categorical
 import datetime
-from pandas import compat
+from pandas import compat, to_timedelta, to_datetime
 from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \
     zip
-from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
 from pandas.lib import max_len_string_array, is_string_array
-from pandas.tslib import NaT
+from pandas.tslib import NaT, Timestamp
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None,
@@ -62,6 +62,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
 _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
 
+stata_epoch = datetime.datetime(1960, 1, 1)
 def _stata_elapsed_date_to_datetime(date, fmt):
     """
     Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
@@ -111,9 +112,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
     if np.isnan(date):
         return NaT
-
     date = int(date)
-    stata_epoch = datetime.datetime(1960, 1, 1)
     if fmt in ["%tc", "tc"]:
         from dateutil.relativedelta import relativedelta
         return stata_epoch + relativedelta(microseconds=date * 1000)
@@ -148,6 +147,158 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     raise ValueError("Date fmt %s not understood" % fmt)
 
 
+def _stata_elapsed_date_to_datetime_vec(dates, fmt):
+    """
+    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
+
+    Parameters
+    ----------
+    dates : array-like
+        The Stata Internal Format date to convert to datetime according to fmt
+    fmt : str
+        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
+        Returns
+
+    Returns
+    -------
+    converted : Series
+        The converted dates
+
+    Examples
+    --------
+    >>> _stata_elapsed_date_to_datetime(52, "%tw")
+    datetime.datetime(1961, 1, 1, 0, 0)
+
+    Notes
+    -----
+    datetime/c - tc
+        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
+    datetime/C - tC - NOT IMPLEMENTED
+        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
+    date - td
+        days since 01jan1960 (01jan1960 = 0)
+    weekly date - tw
+        weeks since 1960w1
+        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
+        The datetime value is the start of the week in terms of days in the
+        year, not ISO calendar weeks.
+    monthly date - tm
+        months since 1960m1
+    quarterly date - tq
+        quarters since 1960q1
+    half-yearly date - th
+        half-years since 1960h1 yearly
+    date - ty
+        years since 0000
+
+    If you don't have pandas with datetime support, then you can't do
+    milliseconds accurately.
+    """
+    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
+    MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
+    MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
+    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
+    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
+
+    def convert_year_month_safe(year, month):
+        """
+        Convert year and month to datetimes, using pandas vectorized versions
+        when the date range falls within the range supported by pandas. Other
+        wise it falls back to a slower but more robust method using datetime.
+        """
+        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
+            return to_datetime(100 * year + month, format='%Y%m')
+        else:
+            return Series(
+                [datetime.datetime(y, m, 1) for y, m in zip(year, month)])
+
+    def convert_year_days_safe(year, days):
+        """
+        Converts year (e.g. 1999) and days since the start of the year to a
+        datetime or datetime64 Series
+        """
+        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
+            return to_datetime(year, format='%Y') + to_timedelta(days, unit='d')
+        else:
+            value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for
+                     y, d in zip(year, days)]
+            return Series(value)
+
+    def convert_delta_safe(base, deltas, unit):
+        """
+        Convert base dates and deltas to datetimes, using pandas vectorized
+        versions if the deltas satisfy restrictions required to be expressed
+        as dates in pandas.
+        """
+        if unit == 'd':
+            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
+                values = [base + relativedelta(days=int(d)) for d in deltas]
+                return Series(values)
+        elif unit == 'ms':
+            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
+                values = [base + relativedelta(microseconds=(int(d) * 1000)) for
+                          d in deltas]
+                return Series(values)
+        else:
+            raise ValueError('format not understood')
+
+        base = to_datetime(base)
+        deltas = to_timedelta(deltas, unit=unit)
+        return base + deltas
+
+    # TODO: If/when pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
+    bad_locs = np.isnan(dates)
+    has_bad_values = False
+    if bad_locs.any():
+        has_bad_values = True
+        data_col = Series(dates)
+        data_col[bad_locs] = 1.0 # Replace with NaT
+    dates = dates.astype(np.int64)
+
+    if fmt in ["%tc", "tc"]: # Delta ms relative to base
+        base = stata_epoch
+        ms = dates
+        conv_dates = convert_delta_safe(base, ms, 'ms')
+    elif fmt in ["%tC", "tC"]:
+        from warnings import warn
+
+        warn("Encountered %tC format. Leaving in Stata Internal Format.")
+        conv_dates = Series(dates, dtype=np.object)
+        if has_bad_values:
+            conv_dates[bad_locs] = np.nan
+        return conv_dates
+    elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base
+        base = stata_epoch
+        days = dates
+        conv_dates = convert_delta_safe(base, days, 'd')
+    elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week
+        year = stata_epoch.year + dates // 52
+        days = (dates % 52) * 7
+        conv_dates = convert_year_days_safe(year, days)
+    elif fmt in ["%tm", "tm"]: # Delta months relative to base
+        year = stata_epoch.year + dates // 12
+        month = (dates % 12) + 1
+        conv_dates = convert_year_month_safe(year, month)
+    elif fmt in ["%tq", "tq"]: # Delta quarters relative to base
+        year = stata_epoch.year + dates // 4
+        month = (dates % 4) * 3 + 1
+        conv_dates = convert_year_month_safe(year, month)
+    elif fmt in ["%th", "th"]: # Delta half-years relative to base
+        year = stata_epoch.year + dates // 2
+        month = (dates % 2) * 6 + 1
+        conv_dates = convert_year_month_safe(year, month)
+    elif fmt in ["%ty", "ty"]: # Years -- not delta
+        # TODO: Check about negative years, here, and raise or warn if needed
+        year = dates
+        month = np.ones_like(dates)
+        conv_dates = convert_year_month_safe(year, month)
+    else:
+        raise ValueError("Date fmt %s not understood" % fmt)
+
+    if has_bad_values: # Restore NaT for bad values
+        conv_dates[bad_locs] = NaT
+    return conv_dates
+
 def _datetime_to_stata_elapsed(date, fmt):
     """
     Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
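The branches in the new function all reduce to either one vectorized ``to_datetime``/``to_timedelta`` call or a slower per-element fallback when values leave the datetime64[ns] range (roughly years 1677 to 2262). A rough standalone sketch of the same arithmetic for the %td and %tm cases, using only the public pandas API and made-up sample values:

import numpy as np
import pandas as pd

stata_epoch = pd.Timestamp("1960-01-01")

# %td: days since 01jan1960, converted in a single vectorized step.
# NaN (a missing Stata date) becomes NaT via to_timedelta.
td = pd.Series([0.0, 366.0, 20089.0, np.nan])
td_as_datetime = stata_epoch + pd.to_timedelta(td, unit="D")

# %tm: months since 1960m1 -> year/month arithmetic, then one to_datetime call.
tm = pd.Series([0, 5, 659])
year = 1960 + tm // 12
month = tm % 12 + 1
tm_as_datetime = pd.to_datetime((year * 100 + month).astype(str), format="%Y%m")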
@@ -477,6 +628,14 @@ def __init__(self, encoding):
                 'f': np.float32(struct.unpack('<f', b'\x00\x00\x00\x7f')[0]),
                 'd': np.float64(struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
             }
+        self.NUMPY_TYPE_MAP = \
+            {
+                'b': 'i1',
+                'h': 'i2',
+                'l': 'i4',
+                'f': 'f4',
+                'd': 'f8'
+            }
 
         # Reserved words cannot be used as variable names
         self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
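NUMPY_TYPE_MAP translates Stata's numeric type codes into numpy dtype strings so that an entire variable list can be described by one structured dtype. A small illustration with a hypothetical typlist (an int8 column, a double column, and an 8-byte string column, whose width appears in the typlist as a plain int):

import numpy as np

NUMPY_TYPE_MAP = {'b': 'i1', 'h': 'i2', 'l': 'i4', 'f': 'f4', 'd': 'f8'}

typlist = ['b', 'd', 8]  # hypothetical: numeric codes plus a string width

fields = []
for i, typ in enumerate(typlist):
    if typ in NUMPY_TYPE_MAP:
        fields.append(('s' + str(i), NUMPY_TYPE_MAP[typ]))  # numeric column
    else:
        fields.append(('s' + str(i), 'S' + str(typ)))       # fixed-width string

dtype = np.dtype(fields)
print(dtype.itemsize)  # 1 + 8 + 8 = 17 bytes per observation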
@@ -759,15 +918,6 @@ def _calcsize(self, fmt):
         return (type(fmt) is int and fmt
                 or struct.calcsize(self.byteorder + fmt))
 
-    def _col_size(self, k=None):
-        if k is None:
-            return self.col_sizes
-        else:
-            return self.col_sizes[k]
-
-    def _unpack(self, fmt, byt):
-        return struct.unpack(self.byteorder + fmt, byt)[0]
-
     def _null_terminate(self, s):
         if compat.PY3 or self._encoding is not None: # have bytes not strings,
                                                      # so must decode
@@ -784,55 +934,6 @@ def _null_terminate(self, s):
             except:
                 return s
 
-    def _next(self):
-        typlist = self.typlist
-        if self.has_string_data:
-            data = [None] * self.nvar
-            for i in range(len(data)):
-                if type(typlist[i]) is int:
-                    data[i] = self._null_terminate(
-                        self.path_or_buf.read(typlist[i])
-                    )
-                else:
-                    data[i] = self._unpack(
-                        typlist[i], self.path_or_buf.read(self._col_size(i))
-                    )
-            return data
-        else:
-            return lmap(
-                lambda i: self._unpack(typlist[i],
-                                       self.path_or_buf.read(
-                                           self._col_size(i)
-                                       )),
-                range(self.nvar)
-            )
-
-
-    def _dataset(self):
-        """
-        Returns a Python generator object for iterating over the dataset.
-
-
-        Parameters
-        ----------
-
-        Returns
-        -------
-        Generator object for iterating over the dataset. Yields each row of
-        observations as a list by default.
-
-        Notes
-        -----
-        If missing_values is True during instantiation of StataReader then
-        observations with _StataMissingValue(s) are not filtered and should
-        be handled by your applcation.
-        """
-
-        self.path_or_buf.seek(self.data_location)
-
-        for i in range(self.nobs):
-            yield self._next()
-
     def _read_value_labels(self):
         if self.format_version >= 117:
             self.path_or_buf.seek(self.seek_value_labels)
@@ -932,27 +1033,32 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
         if self.format_version >= 117:
             self._read_strls()
 
-        stata_dta = self._dataset()
-
-        data = []
-        for rownum, line in enumerate(stata_dta):
-            # doesn't handle missing value objects, just casts
-            # None will only work without missing value object.
-            for i, val in enumerate(line):
-                #NOTE: This will only be scalar types because missing strings
-                # are empty not None in Stata
-                if val is None:
-                    line[i] = np.nan
-            data.append(tuple(line))
+        # Read data
+        count = self.nobs
+        dtype = [] # Convert struct data types to numpy data type
+        for i, typ in enumerate(self.typlist):
+            if typ in self.NUMPY_TYPE_MAP:
+                dtype.append(('s' + str(i), self.NUMPY_TYPE_MAP[typ]))
+            else:
+                dtype.append(('s' + str(i), 'S' + str(typ)))
+        dtype = np.dtype(dtype)
+        read_len = count * dtype.itemsize
+        self.path_or_buf.seek(self.data_location)
+        data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count)
+        self._data_read = True
 
         if convert_categoricals:
             self._read_value_labels()
 
-        # TODO: Refactor to use a dictionary constructor and the correct dtype from the start?
         if len(data)==0:
             data = DataFrame(columns=self.varlist, index=index)
         else:
-            data = DataFrame(data, columns=self.varlist, index=index)
+            data = DataFrame.from_records(data, index=index)
+            data.columns = self.varlist
+
+        for col, typ in zip(data, self.typlist):
+            if type(typ) is int:
+                data[col] = data[col].apply(self._null_terminate, convert_dtype=True,)
 
         cols_ = np.where(self.dtyplist)[0]
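The replacement reads the whole fixed-width data block in one pass: a structured dtype describes one observation, a single read() plus np.frombuffer() decodes every row at once, and DataFrame.from_records builds the frame. A self-contained sketch on a synthetic in-memory buffer rather than a real .dta file (column names and the decoding step are illustrative):

import io
import numpy as np
from pandas import DataFrame

# Two observations of (int8, float64, 5-byte string), packed back to back the
# way a fixed-width Stata data block lays them out.
dtype = np.dtype([('s0', 'i1'), ('s1', 'f8'), ('s2', 'S5')])
records = np.array([(1, 3.5, b'ab'), (2, -1.0, b'hello')], dtype=dtype)
buf = io.BytesIO(records.tobytes())

count = 2
raw = buf.read(count * dtype.itemsize)
data = np.frombuffer(raw, dtype=dtype, count=count)  # no per-row struct.unpack

df = DataFrame.from_records(data)
df.columns = ['v1', 'v2', 'name']

# String fields come back as bytes; strip any null padding and decode,
# mirroring the per-column _null_terminate step above.
df['name'] = df['name'].apply(lambda s: s.split(b'\x00', 1)[0].decode('latin-1'))
print(df)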

@@ -1010,8 +1116,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
                                  self.fmtlist))[0]
             for i in cols:
                 col = data.columns[i]
-                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
-                                            args=(self.fmtlist[i],))
+                data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i])
 
         if convert_categoricals:
             cols = np.where(

pandas/io/tests/data/stata9_115.dta (2.18 KB, binary file not shown)

pandas/io/tests/data/stata9_117.dta (2.18 KB, binary file not shown)
