From 14b28a182ad7a3a8f62e44a2c7cf3e4bd93edcf3 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 18 Aug 2014 19:09:48 +0100 Subject: [PATCH] PERF: StataReader is slow StataReader does not make use of vectorized operations. To improve performance, the following changes have been made: * Use numpy.frombuffer to read the stored data in a single operation * Vectorize date conversion * Removal of unreachable private functions A test was added to verify the conversion of dates that are out of the range supported by datetime64[ns] --- doc/source/v0.15.0.txt | 2 +- pandas/io/stata.py | 263 +++++++++++++++++++--------- pandas/io/tests/data/stata9_115.dta | Bin 0 -> 2230 bytes pandas/io/tests/data/stata9_117.dta | Bin 0 -> 2230 bytes pandas/io/tests/test_stata.py | 42 +++++ vb_suite/packers.py | 10 +- 6 files changed, 232 insertions(+), 85 deletions(-) create mode 100644 pandas/io/tests/data/stata9_115.dta create mode 100644 pandas/io/tests/data/stata9_117.dta diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1db189fcc74e3..d608304511a08 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -481,7 +481,7 @@ Performance - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) - Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) - Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`) -- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`) +- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c9a3104eec3f0..0cf57d3035db5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -13,18 +13,18 @@ import sys import struct +from dateutil.relativedelta import relativedelta from pandas.core.base import StringMixin from pandas.core.frame import DataFrame from pandas.core.series import 
Series from pandas.core.categorical import Categorical import datetime -from pandas import compat +from pandas import compat, to_timedelta, to_datetime from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ zip -from pandas import isnull from pandas.io.common import get_filepath_or_buffer from pandas.lib import max_len_string_array, is_string_array -from pandas.tslib import NaT +from pandas.tslib import NaT, Timestamp def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None, @@ -62,6 +62,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] +stata_epoch = datetime.datetime(1960, 1, 1) def _stata_elapsed_date_to_datetime(date, fmt): """ Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime @@ -111,9 +112,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly if np.isnan(date): return NaT - date = int(date) - stata_epoch = datetime.datetime(1960, 1, 1) if fmt in ["%tc", "tc"]: from dateutil.relativedelta import relativedelta return stata_epoch + relativedelta(microseconds=date * 1000) @@ -148,6 +147,158 @@ def _stata_elapsed_date_to_datetime(date, fmt): raise ValueError("Date fmt %s not understood" % fmt) +def _stata_elapsed_date_to_datetime_vec(dates, fmt): + """ + Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : array-like + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. 
Can be, tc, td, tw, tm, tq, th, ty + Returns + + Returns + ------- + converted : Series + The converted dates + + Examples + -------- + >>> _stata_elapsed_date_to_datetime(52, "%tw") + datetime.datetime(1961, 1, 1, 0, 0) + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. + monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 yearly + date - ty + years since 0000 + + If you don't have pandas with datetime support, then you can't do + milliseconds accurately. + """ + MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year + MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days + MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days + MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 + MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 + + def convert_year_month_safe(year, month): + """ + Convert year and month to datetimes, using pandas vectorized versions + when the date range falls within the range supported by pandas. Other + wise it falls back to a slower but more robust method using datetime. + """ + if year.max() < MAX_YEAR and year.min() > MIN_YEAR: + return to_datetime(100 * year + month, format='%Y%m') + else: + return Series( + [datetime.datetime(y, m, 1) for y, m in zip(year, month)]) + + def convert_year_days_safe(year, days): + """ + Converts year (e.g. 
1999) and days since the start of the year to a + datetime or datetime64 Series + """ + if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: + return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') + else: + value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for + y, d in zip(year, days)] + return Series(value) + + def convert_delta_safe(base, deltas, unit): + """ + Convert base dates and deltas to datetimes, using pandas vectorized + versions if the deltas satisfy restrictions required to be expressed + as dates in pandas. + """ + if unit == 'd': + if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: + values = [base + relativedelta(days=int(d)) for d in deltas] + return Series(values) + elif unit == 'ms': + if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: + values = [base + relativedelta(microseconds=(int(d) * 1000)) for + d in deltas] + return Series(values) + else: + raise ValueError('format not understood') + + base = to_datetime(base) + deltas = to_timedelta(deltas, unit=unit) + return base + deltas + + # TODO: If/when pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly + bad_locs = np.isnan(dates) + has_bad_values = False + if bad_locs.any(): + has_bad_values = True + data_col = Series(dates) + data_col[bad_locs] = 1.0 # Replace with NaT + dates = dates.astype(np.int64) + + if fmt in ["%tc", "tc"]: # Delta ms relative to base + base = stata_epoch + ms = dates + conv_dates = convert_delta_safe(base, ms, 'ms') + elif fmt in ["%tC", "tC"]: + from warnings import warn + + warn("Encountered %tC format. 
Leaving in Stata Internal Format.") + conv_dates = Series(dates, dtype=np.object) + if has_bad_values: + conv_dates[bad_locs] = np.nan + return conv_dates + elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + base = stata_epoch + days = dates + conv_dates = convert_delta_safe(base, days, 'd') + elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + year = stata_epoch.year + dates // 52 + days = (dates % 52) * 7 + conv_dates = convert_year_days_safe(year, days) + elif fmt in ["%tm", "tm"]: # Delta months relative to base + year = stata_epoch.year + dates // 12 + month = (dates % 12) + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + year = stata_epoch.year + dates // 4 + month = (dates % 4) * 3 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%th", "th"]: # Delta half-years relative to base + year = stata_epoch.year + dates // 2 + month = (dates % 2) * 6 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%ty", "ty"]: # Years -- not delta + # TODO: Check about negative years, here, and raise or warn if needed + year = dates + month = np.ones_like(dates) + conv_dates = convert_year_month_safe(year, month) + else: + raise ValueError("Date fmt %s not understood" % fmt) + + if has_bad_values: # Restore NaT for bad values + conv_dates[bad_locs] = NaT + return conv_dates + def _datetime_to_stata_elapsed(date, fmt): """ Convert from datetime to SIF. 
http://www.stata.com/help.cgi?datetime @@ -477,6 +628,14 @@ def __init__(self, encoding): 'f': np.float32(struct.unpack('= 117: self.path_or_buf.seek(self.seek_value_labels) @@ -932,27 +1033,32 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, if self.format_version >= 117: self._read_strls() - stata_dta = self._dataset() - - data = [] - for rownum, line in enumerate(stata_dta): - # doesn't handle missing value objects, just casts - # None will only work without missing value object. - for i, val in enumerate(line): - #NOTE: This will only be scalar types because missing strings - # are empty not None in Stata - if val is None: - line[i] = np.nan - data.append(tuple(line)) + # Read data + count = self.nobs + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + read_len = count * dtype.itemsize + self.path_or_buf.seek(self.data_location) + data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count) + self._data_read = True if convert_categoricals: self._read_value_labels() - # TODO: Refactor to use a dictionary constructor and the correct dtype from the start? 
if len(data)==0: data = DataFrame(columns=self.varlist, index=index) else: - data = DataFrame(data, columns=self.varlist, index=index) + data = DataFrame.from_records(data, index=index) + data.columns = self.varlist + + for col, typ in zip(data, self.typlist): + if type(typ) is int: + data[col] = data[col].apply(self._null_terminate, convert_dtype=True,) cols_ = np.where(self.dtyplist)[0] @@ -1010,8 +1116,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = data[col].apply(_stata_elapsed_date_to_datetime, - args=(self.fmtlist[i],)) + data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) if convert_categoricals: cols = np.where( diff --git a/pandas/io/tests/data/stata9_115.dta b/pandas/io/tests/data/stata9_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..6c3b6ab4dc68692342f4d4d0b9d7cf1cea8f2d9a GIT binary patch literal 2230 zcmeH}zl#$=6vyZM!D**ppZxRc5--)qMdd&7FM2}&di%lW@Dk-s*~dL-ka~6-Ffq41DWEAHzLIu zv;~iZWFSQ%cqUk_b`ogrKy#o(R~o^&3v)onF3huO0(73i0^3c9HWwHqyeSf}T@P9xjx{drA*VAJWS30LvdW7Lp3YLmN-$Oy$4;5_`42nVDvlYM@L)Mil>10YT)e*Z0yR_06 zKu2A0wJTGuGFH{KiT#m&XIE8JHetDJLRvOqi3DXPQC01-34?M8K=T6Yt(f#h!=AYY zt`NCQ&7u6;o7t0-Qy-sWTab*AXmzhCgu6XRpPZdFcJTgun8!^2cc*i=#z?QGP*2ia z|9`-LpoZR!?=}*&+FbIM5>j7%omR$x{u0sGC?9>m%&z+5wthb(WcXQLk+U}E&ZhQt z3)hik#W8ZchzuWuou&zUPjQkYq6=wtc!;mOyQJAX73i)pPXDu!&B%ZxRc5--)qMdd&7FM2}&di%lW@Dk-s*~dL-ka~6-Ffq41DWEAHzLIu zv;~iZWFSQ%cqUk_b`ogrKy#o(R~o^&3v)onF3huO0(73i0^3c9HWwHqyeSf}T@P9xjx{drA*VAJWS30LvdW7Lp3YLmN-$Oy$4;5_`42nVDvlYM@L)Mil>10YT)e*Z0yR_06 zKu2A0wJTGuGFH{KiT#m&XIE8JHetDJLRvOqi3DXPQC01-34?M8K=T6Yt(f#h!=AYY zt`NCQ&7u6;o7t0-Qy-sWTab*AXmzhCgu6XRpPZdFcJTgun8!^2cc*i=#z?QGP*2ia z|9`-LpoZR!?=}*&+FbIM5>j7%omR$x{u0sGC?9>m%&z+5wthb(WcXQLk+U}E&ZhQt z3)hik#W8ZchzuWuou&zUPjQkYq6=wtc!;mOyQJAX73i)pPXDu!&B%