diff --git a/doc/source/io.rst b/doc/source/io.rst index baf684056e169..6e5d254d27b7f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3558,6 +3558,13 @@ read and used to create a ``Categorical`` variable from them. Value labels can also be retrieved by the function ``variable_labels``, which requires data to be called before (see ``pandas.io.stata.StataReader``). +The parameter ``convert_missing`` indicates whether missing value +representations in Stata should be preserved. If ``False`` (the default), +missing values are represented as ``np.nan``. If ``True``, missing values are +represented using ``StataMissingValue`` objects, and columns containing missing +values will have ``dtype`` set to ``object``. + + The StataReader supports .dta Formats 104, 105, 108, 113-115 and 117. Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 40a95ab103b0b..85f620fcd4b99 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -144,6 +144,11 @@ API changes strings must contain 244 or fewer characters. Attempting to write Stata dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`) +- ``read_stata`` and ``StataReader`` can import missing data information into a + ``DataFrame`` by setting the argument ``convert_missing`` to ``True``. When + using this option, missing values are returned as ``StataMissingValue`` + objects and columns containing missing values have ``object`` data type. 
(:issue:`8045`) + - ``Index.isin`` now supports a ``level`` argument to specify which index level to use for membership tests (:issue:`7892`, :issue:`7890`) @@ -414,6 +419,7 @@ Performance - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) - Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) - Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`) +- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5b5ce3e59e16e..c9a3104eec3f0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,7 +9,6 @@ You can find more information on http://presbrey.mit.edu/PyDTA and http://statsmodels.sourceforge.net/devel/ """ -# TODO: Fix this module so it can use cross-compatible zip, map, and range import numpy as np import sys @@ -20,14 +19,16 @@ from pandas.core.categorical import Categorical import datetime from pandas import compat -from pandas.compat import long, lrange, lmap, lzip, text_type, string_types +from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ + zip from pandas import isnull from pandas.io.common import get_filepath_or_buffer from pandas.lib import max_len_string_array, is_string_array from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index=None): + convert_categoricals=True, encoding=None, index=None, + convert_missing=False): """ Read Stata file into DataFrame @@ -44,10 +45,19 @@ def read_stata(filepath_or_buffer, convert_dates=True, support unicode. None defaults to cp1252. index : identifier of index column identifier of column that should be used as index of the DataFrame + convert_missing : boolean, defaults to False + Flag indicating whether to convert missing values to their Stata + representations. 
If False, missing values are replaced with nans. + If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. """ reader = StataReader(filepath_or_buffer, encoding) - return reader.data(convert_dates, convert_categoricals, index) + return reader.data(convert_dates, + convert_categoricals, + index, + convert_missing) _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -291,35 +301,76 @@ class StataMissingValue(StringMixin): Parameters ----------- - offset - value + value : int8, int16, int32, float32 or float64 + The Stata missing value code Attributes ---------- - string - value + string : string + String representation of the Stata missing value + value : int8, int16, int32, float32 or float64 + The original encoded missing value Notes ----- More information: + + Integer missing values make the code '.', '.a', ..., '.z' to the ranges + 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ... + 2147483647 (for int32). Missing values for floating point data types are + more complex but the pattern is simple to discern from the following table. + + np.float32 missing values (float in Stata) + 0000007f . + 0008007f .a + 0010007f .b + ... + 00c0007f .x + 00c8007f .y + 00d0007f .z + + np.float64 missing values (double in Stata) + 000000000000e07f . + 000000000001e07f .a + 000000000002e07f .b + ... + 000000000018e07f .x + 000000000019e07f .y + 00000000001ae07f .z """ - # TODO: Needs test - def __init__(self, offset, value): + + # Construct a dictionary of missing values + MISSING_VALUES = {} + bases = (101, 32741, 2147483621) + for b in bases: + MISSING_VALUES[b] = '.' + for i in range(1, 27): + MISSING_VALUES[i + b] = '.' 
+ chr(96 + i) + + base = b'\x00\x00\x00\x7f' + increment = struct.unpack(' 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack(' 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack('q', struct.pack(' nmax: - if self._missing_values: - return StataMissingValue(nmax, d) - else: - return None - return d + return struct.unpack(self.byteorder + fmt, byt)[0] def _null_terminate(self, s): if compat.PY3 or self._encoding is not None: # have bytes not strings, @@ -752,16 +799,15 @@ def _next(self): ) return data else: - return list( - map( + return lmap( lambda i: self._unpack(typlist[i], self.path_or_buf.read( self._col_size(i) )), range(self.nvar) - ) ) + def _dataset(self): """ Returns a Python generator object for iterating over the dataset. @@ -853,7 +899,8 @@ def _read_strls(self): self.GSO[v_o] = self.path_or_buf.read(length-1) self.path_or_buf.read(1) # zero-termination - def data(self, convert_dates=True, convert_categoricals=True, index=None): + def data(self, convert_dates=True, convert_categoricals=True, index=None, + convert_missing=False): """ Reads observations from Stata file, converting them into a dataframe @@ -866,11 +913,18 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): variables index : identifier of index column identifier of column that should be used as index of the DataFrame + convert_missing : boolean, defaults to False + Flag indicating whether to convert missing values to their Stata + representation. If False, missing values are replaced with + nans. If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. 
Returns ------- y : DataFrame instance """ + self._missing_values = convert_missing if self._data_read: raise Exception("Data has already been read.") self._data_read = True @@ -894,18 +948,62 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): if convert_categoricals: self._read_value_labels() + # TODO: Refactor to use a dictionary constructor and the correct dtype from the start? if len(data)==0: data = DataFrame(columns=self.varlist, index=index) else: data = DataFrame(data, columns=self.varlist, index=index) cols_ = np.where(self.dtyplist)[0] + + # Convert columns (if needed) to match input type + index = data.index + requires_type_conversion = False + data_formatted = [] for i in cols_: if self.dtyplist[i] is not None: col = data.columns[i] - if data[col].dtype is not np.dtype(object): - data[col] = Series(data[col], data[col].index, - self.dtyplist[i]) + dtype = data[col].dtype + if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]): + requires_type_conversion = True + data_formatted.append((col, Series(data[col], index, self.dtyplist[i]))) + else: + data_formatted.append((col, data[col])) + if requires_type_conversion: + data = DataFrame.from_items(data_formatted) + del data_formatted + + # Check for missing values, and replace if found + for i, colname in enumerate(data): + fmt = self.typlist[i] + if fmt not in self.VALID_RANGE: + continue + + nmin, nmax = self.VALID_RANGE[fmt] + series = data[colname] + missing = np.logical_or(series < nmin, series > nmax) + + if not missing.any(): + continue + + if self._missing_values: # Replacement follows Stata notation + missing_loc = np.argwhere(missing) + umissing, umissing_loc = np.unique(series[missing], + return_inverse=True) + replacement = Series(series, dtype=np.object) + for i, um in enumerate(umissing): + missing_value = StataMissingValue(um) + + loc = missing_loc[umissing_loc == i] + replacement.iloc[loc] = missing_value + else: # All replacements are identical + dtype = 
series.dtype + if dtype not in (np.float32, np.float64): + dtype = np.float64 + replacement = Series(series, dtype=dtype) + replacement[missing] = np.nan + + data[colname] = replacement if convert_dates: cols = np.where(lmap(lambda x: x in _date_formats, diff --git a/pandas/io/tests/data/stata8_113.dta b/pandas/io/tests/data/stata8_113.dta new file mode 100644 index 0000000000000..9b0831746025e Binary files /dev/null and b/pandas/io/tests/data/stata8_113.dta differ diff --git a/pandas/io/tests/data/stata8_115.dta b/pandas/io/tests/data/stata8_115.dta new file mode 100644 index 0000000000000..bb78368b3462b Binary files /dev/null and b/pandas/io/tests/data/stata8_115.dta differ diff --git a/pandas/io/tests/data/stata8_117.dta b/pandas/io/tests/data/stata8_117.dta new file mode 100644 index 0000000000000..fcfa7abd7b0d9 Binary files /dev/null and b/pandas/io/tests/data/stata8_117.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 459a1fe6c0e89..9d630bf83ced7 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -5,16 +5,18 @@ import os import warnings import nose +import struct import sys from distutils.version import LooseVersion import numpy as np import pandas as pd +from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, - PossiblePrecisionLoss) + PossiblePrecisionLoss, StataMissingValue) import pandas.util.testing as tm from pandas.util.misc import is_little_endian from pandas import compat @@ -71,6 +73,10 @@ def setUp(self): self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta') self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta') + self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta') + self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta') + self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta') + def read_dta(self, 
file): return read_stata(file, convert_dates=True) @@ -589,6 +595,50 @@ def test_excessively_long_string(self): with tm.ensure_clean() as path: original.to_stata(path) + def test_missing_value_generator(self): + types = ('b','h','l') + df = DataFrame([[0.0]],columns=['float_']) + with tm.ensure_clean() as path: + df.to_stata(path) + valid_range = StataReader(path).VALID_RANGE + expected_values = ['.' + chr(97 + i) for i in range(26)] + expected_values.insert(0, '.') + for t in types: + offset = valid_range[t][1] + for i in range(0,27): + val = StataMissingValue(offset+1+i) + self.assertTrue(val.string == expected_values[i]) + + # Test extremes for floats + val = StataMissingValue(struct.unpack('