diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1db189fcc74e3..d608304511a08 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -481,7 +481,7 @@ Performance - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) - Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) - Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`) -- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`) +- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c9a3104eec3f0..0cf57d3035db5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -13,18 +13,18 @@ import sys import struct +from dateutil.relativedelta import relativedelta from pandas.core.base import StringMixin from pandas.core.frame import DataFrame from pandas.core.series import Series from pandas.core.categorical import Categorical import datetime -from pandas import compat +from pandas import compat, to_timedelta, to_datetime from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ zip -from pandas import isnull from pandas.io.common import get_filepath_or_buffer from pandas.lib import max_len_string_array, is_string_array -from pandas.tslib import NaT +from pandas.tslib import NaT, Timestamp def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None, @@ -62,6 +62,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] +stata_epoch = datetime.datetime(1960, 1, 1) def _stata_elapsed_date_to_datetime(date, fmt): """ Convert from SIF to datetime. 
http://www.stata.com/help.cgi?datetime @@ -111,9 +112,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly if np.isnan(date): return NaT - date = int(date) - stata_epoch = datetime.datetime(1960, 1, 1) if fmt in ["%tc", "tc"]: from dateutil.relativedelta import relativedelta return stata_epoch + relativedelta(microseconds=date * 1000) @@ -148,6 +147,158 @@ def _stata_elapsed_date_to_datetime(date, fmt): raise ValueError("Date fmt %s not understood" % fmt) +def _stata_elapsed_date_to_datetime_vec(dates, fmt): + """ + Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : array-like + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. Can be tc, td, tw, tm, tq, th, ty + + + Returns + ------- + converted : Series + The converted dates + + Examples + -------- + >>> _stata_elapsed_date_to_datetime_vec(Series([52]), "%tw")[0] + Timestamp('1961-01-01 00:00:00') + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. + monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 + yearly date - ty + years since 0000 + + If you don't have pandas with datetime support, then you can't do + milliseconds accurately. 
+ """ + MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year + MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days + MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days + MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 + MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 + + def convert_year_month_safe(year, month): + """ + Convert year and month to datetimes, using pandas vectorized versions + when the date range falls within the range supported by pandas. + Otherwise it falls back to a slower but more robust method using datetime. + """ + if year.max() < MAX_YEAR and year.min() > MIN_YEAR: + return to_datetime(100 * year + month, format='%Y%m') + else: + return Series( + [datetime.datetime(y, m, 1) for y, m in zip(year, month)]) + + def convert_year_days_safe(year, days): + """ + Converts year (e.g. 1999) and days since the start of the year to a + datetime or datetime64 Series + """ + if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: + return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') + else: + value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for + y, d in zip(year, days)] + return Series(value) + + def convert_delta_safe(base, deltas, unit): + """ + Convert base dates and deltas to datetimes, using pandas vectorized + versions if the deltas satisfy restrictions required to be expressed + as dates in pandas. 
+ """ + if unit == 'd': + if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: + values = [base + relativedelta(days=int(d)) for d in deltas] + return Series(values) + elif unit == 'ms': + if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: + values = [base + relativedelta(microseconds=(int(d) * 1000)) for + d in deltas] + return Series(values) + else: + raise ValueError('format not understood') + + base = to_datetime(base) + deltas = to_timedelta(deltas, unit=unit) + return base + deltas + + # TODO: If/when pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly + bad_locs = np.isnan(dates) + has_bad_values = False + if bad_locs.any(): + has_bad_values = True + data_col = Series(dates) + data_col[bad_locs] = 1.0 # Replace with NaT + dates = dates.astype(np.int64) + + if fmt in ["%tc", "tc"]: # Delta ms relative to base + base = stata_epoch + ms = dates + conv_dates = convert_delta_safe(base, ms, 'ms') + elif fmt in ["%tC", "tC"]: + from warnings import warn + + warn("Encountered %tC format. 
Leaving in Stata Internal Format.") + conv_dates = Series(dates, dtype=np.object) + if has_bad_values: + conv_dates[bad_locs] = np.nan + return conv_dates + elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + base = stata_epoch + days = dates + conv_dates = convert_delta_safe(base, days, 'd') + elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + year = stata_epoch.year + dates // 52 + days = (dates % 52) * 7 + conv_dates = convert_year_days_safe(year, days) + elif fmt in ["%tm", "tm"]: # Delta months relative to base + year = stata_epoch.year + dates // 12 + month = (dates % 12) + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + year = stata_epoch.year + dates // 4 + month = (dates % 4) * 3 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%th", "th"]: # Delta half-years relative to base + year = stata_epoch.year + dates // 2 + month = (dates % 2) * 6 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%ty", "ty"]: # Years -- not delta + # TODO: Check about negative years, here, and raise or warn if needed + year = dates + month = np.ones_like(dates) + conv_dates = convert_year_month_safe(year, month) + else: + raise ValueError("Date fmt %s not understood" % fmt) + + if has_bad_values: # Restore NaT for bad values + conv_dates[bad_locs] = NaT + return conv_dates + def _datetime_to_stata_elapsed(date, fmt): """ Convert from datetime to SIF. 
http://www.stata.com/help.cgi?datetime @@ -477,6 +628,14 @@ def __init__(self, encoding): 'f': np.float32(struct.unpack('= 117: self.path_or_buf.seek(self.seek_value_labels) @@ -932,27 +1033,32 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, if self.format_version >= 117: self._read_strls() - stata_dta = self._dataset() - - data = [] - for rownum, line in enumerate(stata_dta): - # doesn't handle missing value objects, just casts - # None will only work without missing value object. - for i, val in enumerate(line): - #NOTE: This will only be scalar types because missing strings - # are empty not None in Stata - if val is None: - line[i] = np.nan - data.append(tuple(line)) + # Read data + count = self.nobs + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + read_len = count * dtype.itemsize + self.path_or_buf.seek(self.data_location) + data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count) + self._data_read = True if convert_categoricals: self._read_value_labels() - # TODO: Refactor to use a dictionary constructor and the correct dtype from the start? 
if len(data)==0: data = DataFrame(columns=self.varlist, index=index) else: - data = DataFrame(data, columns=self.varlist, index=index) + data = DataFrame.from_records(data, index=index) + data.columns = self.varlist + + for col, typ in zip(data, self.typlist): + if type(typ) is int: + data[col] = data[col].apply(self._null_terminate, convert_dtype=True,) cols_ = np.where(self.dtyplist)[0] @@ -1010,8 +1116,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = data[col].apply(_stata_elapsed_date_to_datetime, - args=(self.fmtlist[i],)) + data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) if convert_categoricals: cols = np.where( diff --git a/pandas/io/tests/data/stata9_115.dta b/pandas/io/tests/data/stata9_115.dta new file mode 100644 index 0000000000000..6c3b6ab4dc686 Binary files /dev/null and b/pandas/io/tests/data/stata9_115.dta differ diff --git a/pandas/io/tests/data/stata9_117.dta b/pandas/io/tests/data/stata9_117.dta new file mode 100644 index 0000000000000..6c3b6ab4dc686 Binary files /dev/null and b/pandas/io/tests/data/stata9_117.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 9d630bf83ced7..54c1dd20029ee 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -18,6 +18,7 @@ from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) import pandas.util.testing as tm +from pandas.tslib import NaT from pandas.util.misc import is_little_endian from pandas import compat @@ -77,6 +78,10 @@ def setUp(self): self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta') self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta') + self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') + self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + + def read_dta(self, file): return read_stata(file, 
convert_dates=True) @@ -640,6 +645,43 @@ def test_missing_value_conversion(self): tm.assert_frame_equal(expected, parsed_115) tm.assert_frame_equal(expected, parsed_117) + def test_big_dates(self): + yr = [1960, 2000, 9999, 100] + mo = [1, 1, 12, 1] + dd = [1, 1, 31, 1] + hr = [0, 0, 23, 0] + mm = [0, 0, 59, 0] + ss = [0, 0, 59, 0] + expected = [] + for i in range(4): + row = [] + for j in range(7): + if j == 0: + row.append( + datetime(yr[i], mo[i], dd[i], hr[i], mm[i], ss[i])) + elif j == 6: + row.append(datetime(yr[i], 1, 1)) + else: + row.append(datetime(yr[i], mo[i], dd[i])) + expected.append(row) + expected.append([NaT] * 7) + columns = ['date_tc', 'date_td', 'date_tw', 'date_tm', 'date_tq', + 'date_th', 'date_ty'] + # Fixes for weekly, quarterly,half,year + expected[2][2] = datetime(9999,12,24) + expected[2][3] = datetime(9999,12,1) + expected[2][4] = datetime(9999,10,1) + expected[2][5] = datetime(9999,7,1) + + expected = DataFrame(expected, columns=columns, dtype=np.object) + + parsed_115 = read_stata(self.dta18_115) + parsed_117 = read_stata(self.dta18_117) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) + assert True + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/vb_suite/packers.py b/vb_suite/packers.py index cb933746bef83..403adbf289e1f 100644 --- a/vb_suite/packers.py +++ b/vb_suite/packers.py @@ -133,13 +133,13 @@ def remove(f): packers_write_stata = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date) setup = common_setup + """ -df['int8_'] = [randint(-127,100) for _ in range(N)] -df['int16_'] = [randint(-127,100) for _ in range(N)] -df['int32_'] = [randint(-127,100) for _ in range(N)] +df['int8_'] = [randint(np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27) for _ in range(N)] +df['int16_'] = [randint(np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27) for _ in range(N)] 
+df['int32_'] = [randint(np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27) for _ in range(N)] df['float32_'] = np.array(randn(N), dtype=np.float32) df.to_stata(f, {'index': 'tc'}) """ -packers_read_stata_with_int = Benchmark("pd.read_stata(f)", setup, start_date=start_date) +packers_read_stata_with_validation = Benchmark("pd.read_stata(f)", setup, start_date=start_date) -packers_write_stata_with_int = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date) +packers_write_stata_with_validation = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date)