From d759156819fab85efa14b5f28836361f238ea581 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Mon, 8 Aug 2016 14:55:28 -0400
Subject: [PATCH 01/14] BUG: properly close files opened by parsers

---
 doc/source/whatsnew/v0.19.0.txt              |  2 ++
 pandas/io/common.py                          |  4 +++-
 pandas/io/parsers.py                         | 22 +++++++++++++-------
 pandas/io/tests/parser/common.py             |  1 +
 pandas/io/tests/parser/python_parser_only.py |  3 ++-
 pandas/io/tests/parser/test_textreader.py    |  3 ++-
 6 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 4f81eafa3adaf..b09ebf9f9a5dd 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -902,6 +902,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
+- Bug in ``pd.read_csv``, ``pd.read_table`` and ``pd.read_stata`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
+- Bug in ``StataReader`` and ``StataWriter`` where a file was not properly closed when an error was raised. (:issue:`13940`)
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6f9bddd0fdf9b..b7ac183b7ab41 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -327,7 +327,9 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):

    if memory_map and hasattr(f, 'fileno'):
        try:
-            f = MMapWrapper(f)
+            g = MMapWrapper(f)
+            f.close()
+            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 7846ccd1a6660..30023afa5a26a 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -393,11 +393,15 @@ def _read(filepath_or_buffer, kwds):
        raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
                                  " together yet.")
    elif nrows is not None:
-        return parser.read(nrows)
+        data = parser.read(nrows)
+        parser.close()
+        return data
    elif chunksize or iterator:
        return parser

-    return parser.read()
+    data = parser.read()
+    parser.close()
+    return data

 _parser_defaults = {
    'delimiter': None,
@@ -727,10 +731,7 @@ def __init__(self, f, engine=None, **kwds):
        self._make_engine(self.engine)

    def close(self):
-        try:
-            self._engine._reader.close()
-        except:
-            pass
+        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options
@@ -1057,8 +1058,13 @@ def __init__(self, kwds):

        self._first_chunk = True

+        # GH 13932
+        # keep references to file handles opened by the parser itself
+        self.handles = []
+
    def close(self):
-        self._reader.close()
+        for f in self.handles:
+            f.close()

    @property
    def _has_complex_date_col(self):
@@ -1356,6 +1362,7 @@ def __init__(self, src, **kwds):

        if 'utf-16' in (kwds.get('encoding') or ''):
            if isinstance(src, compat.string_types):
                src = open(src, 'rb')
+                self.handles.append(src)
            src = UTF8Recoder(src, kwds['encoding'])
            kwds['encoding'] = 'utf-8'
@@ -1760,6 +1767,7 @@ def __init__(self, f, **kwds):
            f = TextIOWrapper(f, encoding=self.encoding)

        # Set self.data to something that can read lines.
+        self.handles.append(f)
        if hasattr(f, 'readline'):
            self._make_reader(f)
        else:
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 619ac7b4c77ef..96eb0ec6fd7a2 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1580,5 +1580,6 @@ def test_temporary_file(self):
            new_file.seek(0)

            result = self.read_csv(new_file, sep='\s+', header=None)
+            new_file.close()
            expected = DataFrame([[0, 0]])
            tm.assert_frame_equal(result, expected)
diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py
index a7389fd174e1d..3214aa39358e8 100644
--- a/pandas/io/tests/parser/python_parser_only.py
+++ b/pandas/io/tests/parser/python_parser_only.py
@@ -130,7 +130,8 @@ def test_decompression_regex_sep(self):
        except ImportError:
            raise nose.SkipTest('need gzip and bz2 to run')

-        data = open(self.csv1, 'rb').read()
+        with open(self.csv1, 'rb') as f:
+            data = f.read()
        data = data.replace(b',', b'::')
        expected = self.read_csv(self.csv1)
diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py
index fd2f49cef656a..7dda9eb9d0af4 100644
--- a/pandas/io/tests/parser/test_textreader.py
+++ b/pandas/io/tests/parser/test_textreader.py
@@ -54,7 +54,8 @@ def test_file_handle_mmap(self):
        f.close()

    def test_StringIO(self):
-        text = open(self.csv1, 'rb').read()
+        with open(self.csv1, 'rb') as f:
+            text = f.read()
        src = BytesIO(text)
        reader = TextReader(src, header=None)
        reader.read()
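
The bookkeeping pattern patch 01 introduces recurs throughout the series: an
engine records every handle it opened itself in ``self.handles``, and
``close()`` releases exactly those. A standalone illustration with simplified,
hypothetical names (a sketch of the idea, not pandas code):

    class ParserSketch(object):
        # Track only the handles this object opened; never claim
        # ownership of objects the caller passed in.
        def __init__(self, source):
            self.handles = []
            if isinstance(source, str):
                source = open(source, 'rb')
                self.handles.append(source)  # opened here, so closed here
            self.data = source

        def close(self):
            for f in self.handles:
                f.close()
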
From 1e39a5e48651f45369953be1386192918e5442ee Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 10:17:56 -0400
Subject: [PATCH 02/14] Properly close opened files in three tests

---
 pandas/io/tests/test_common.py | 11 +++++------
 pandas/tests/series/test_io.py |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py
index a443df5dac586..c08d235b07c9e 100644
--- a/pandas/io/tests/test_common.py
+++ b/pandas/io/tests/test_common.py
@@ -116,8 +116,8 @@ def test_constructor_bad_file(self):
        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)

    def test_get_attr(self):
-        target = open(self.mmap_file, 'r')
-        wrapper = common.MMapWrapper(target)
+        with open(self.mmap_file, 'r') as target:
+            wrapper = common.MMapWrapper(target)

        attrs = dir(wrapper.mmap)
        attrs = [attr for attr in attrs
@@ -130,10 +130,9 @@ def test_get_attr(self):
        self.assertFalse(hasattr(wrapper, 'foo'))

    def test_next(self):
-        target = open(self.mmap_file, 'r')
-        wrapper = common.MMapWrapper(target)
-
-        lines = target.readlines()
+        with open(self.mmap_file, 'r') as target:
+            wrapper = common.MMapWrapper(target)
+            lines = target.readlines()

        for line in lines:
            next_line = next(wrapper)
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index f89501d39f014..48528dc54adbd 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -64,7 +64,8 @@ def test_to_csv(self):
        with ensure_clean() as path:
            self.ts.to_csv(path)

-            lines = io.open(path, newline=None).readlines()
+            with io.open(path, newline=None) as f:
+                lines = f.readlines()
            assert (lines[1] != '\n')

            self.ts.to_csv(path, index=False)
From 3b0f25f87eb259f38e13a1d781418fb480a4228d Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 10:18:49 -0400
Subject: [PATCH 03/14] Properly close opened files in StataReader

---
 pandas/io/stata.py | 282 ++++++++++++++++++++++-----------------------
 1 file changed, 141 insertions(+), 141 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59bc24acac6f8..13244deda00d0 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -167,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                         chunksize=chunksize, encoding=encoding)

    if iterator or chunksize:
-        try:
-            return reader
-        except StopIteration:
-            reader.close()
-
-    try:
-        return reader.read()
-    finally:
+        data = reader
+    else:
+        data = reader.read()
        reader.close()
+    return data


 _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
@@ -1411,150 +1407,154 @@ def read(self, nrows=None, convert_dates=None,
             convert_categoricals=None, index=None, convert_missing=None,
             preserve_dtypes=None, columns=None, order_categoricals=None):
-
        # Handle empty file or chunk.  If reading incrementally raise
        # StopIteration.  If reading the whole thing return an empty
        # data frame.
        if (self.nobs == 0) and (nrows is None):
            self._can_read_value_labels = True
            self._data_read = True
+            self.close()
            return DataFrame(columns=self.varlist)

-        # Handle options
-        if convert_dates is None:
-            convert_dates = self._convert_dates
-        if convert_categoricals is None:
-            convert_categoricals = self._convert_categoricals
-        if convert_missing is None:
-            convert_missing = self._convert_missing
-        if preserve_dtypes is None:
-            preserve_dtypes = self._preserve_dtypes
-        if columns is None:
-            columns = self._columns
-        if order_categoricals is None:
-            order_categoricals = self._order_categoricals
-
-        if nrows is None:
-            nrows = self.nobs
-
-        if (self.format_version >= 117) and (self._dtype is None):
-            self._can_read_value_labels = True
-            self._read_strls()
-
-        # Setup the dtype.
-        if self._dtype is None:
-            dtype = []  # Convert struct data types to numpy data type
-            for i, typ in enumerate(self.typlist):
-                if typ in self.NUMPY_TYPE_MAP:
-                    dtype.append(('s' + str(i), self.byteorder +
-                                  self.NUMPY_TYPE_MAP[typ]))
-                else:
-                    dtype.append(('s' + str(i), 'S' + str(typ)))
-            dtype = np.dtype(dtype)
-            self._dtype = dtype
-
-        # Read data
-        dtype = self._dtype
-        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
-        read_len = nrows * dtype.itemsize
-        read_len = min(read_len, max_read_len)
-        if read_len <= 0:
-            # Iterator has finished, should never be here unless
-            # we are reading the file incrementally
-            if convert_categoricals:
-                self._read_value_labels()
-            raise StopIteration
-        offset = self._lines_read * dtype.itemsize
-        self.path_or_buf.seek(self.data_location + offset)
-        read_lines = min(nrows, self.nobs - self._lines_read)
-        data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
-                             count=read_lines)
-
-        self._lines_read += read_lines
-        if self._lines_read == self.nobs:
-            self._can_read_value_labels = True
-            self._data_read = True
-        # if necessary, swap the byte order to native here
-        if self.byteorder != self._native_byteorder:
-            data = data.byteswap().newbyteorder()
-
-        if convert_categoricals:
-            self._read_value_labels()
-
-        if len(data) == 0:
-            data = DataFrame(columns=self.varlist, index=index)
-        else:
-            data = DataFrame.from_records(data, index=index)
-            data.columns = self.varlist
-
-        # If index is not specified, use actual row number rather than
-        # restarting at 0 for each chunk.
-        if index is None:
-            ix = np.arange(self._lines_read - read_lines, self._lines_read)
-            data = data.set_index(ix)
-
-        if columns is not None:
-            data = self._do_select_columns(data, columns)
-
-        # Decode strings
-        for col, typ in zip(data, self.typlist):
-            if type(typ) is int:
-                data[col] = data[col].apply(
-                    self._null_terminate, convert_dtype=True)
-
-        data = self._insert_strls(data)
-
-        cols_ = np.where(self.dtyplist)[0]
-
-        # Convert columns (if needed) to match input type
-        index = data.index
-        requires_type_conversion = False
-        data_formatted = []
-        for i in cols_:
-            if self.dtyplist[i] is not None:
-                col = data.columns[i]
-                dtype = data[col].dtype
-                if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
-                    requires_type_conversion = True
-                    data_formatted.append(
-                        (col, Series(data[col], index, self.dtyplist[i])))
-                else:
-                    data_formatted.append((col, data[col]))
-        if requires_type_conversion:
-            data = DataFrame.from_items(data_formatted)
-        del data_formatted
-
-        self._do_convert_missing(data, convert_missing)
-
-        if convert_dates:
-            cols = np.where(lmap(lambda x: x in _date_formats,
-                                 self.fmtlist))[0]
-            for i in cols:
-                col = data.columns[i]
-                data[col] = _stata_elapsed_date_to_datetime_vec(
-                    data[col],
-                    self.fmtlist[i])
-
-        if convert_categoricals and self.format_version > 108:
-            data = self._do_convert_categoricals(data,
-                                                 self.value_label_dict,
-                                                 self.lbllist,
-                                                 order_categoricals)
-
-        if not preserve_dtypes:
-            retyped_data = []
-            convert = False
-            for col in data:
-                dtype = data[col].dtype
-                if dtype in (np.float16, np.float32):
-                    dtype = np.float64
-                    convert = True
-                elif dtype in (np.int8, np.int16, np.int32):
-                    dtype = np.int64
-                    convert = True
-                retyped_data.append((col, data[col].astype(dtype)))
-            if convert:
-                data = DataFrame.from_items(retyped_data)
+        try:
+            # Handle options
+            if convert_dates is None:
+                convert_dates = self._convert_dates
+            if convert_categoricals is None:
+                convert_categoricals = self._convert_categoricals
+            if convert_missing is None:
+                convert_missing = self._convert_missing
+            if preserve_dtypes is None:
+                preserve_dtypes = self._preserve_dtypes
+            if columns is None:
+                columns = self._columns
+            if order_categoricals is None:
+                order_categoricals = self._order_categoricals
+
+            if nrows is None:
+                nrows = self.nobs
+
+            if (self.format_version >= 117) and (self._dtype is None):
+                self._can_read_value_labels = True
+                self._read_strls()
+
+            # Setup the dtype.
+            if self._dtype is None:
+                dtype = []  # Convert struct data types to numpy data type
+                for i, typ in enumerate(self.typlist):
+                    if typ in self.NUMPY_TYPE_MAP:
+                        dtype.append(('s' + str(i), self.byteorder +
+                                      self.NUMPY_TYPE_MAP[typ]))
+                    else:
+                        dtype.append(('s' + str(i), 'S' + str(typ)))
+                dtype = np.dtype(dtype)
+                self._dtype = dtype
+
+            # Read data
+            dtype = self._dtype
+            max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
+            read_len = nrows * dtype.itemsize
+            read_len = min(read_len, max_read_len)
+            if read_len <= 0:
+                # Iterator has finished, should never be here unless
+                # we are reading the file incrementally
+                if convert_categoricals:
+                    self._read_value_labels()
+                raise StopIteration
+            offset = self._lines_read * dtype.itemsize
+            self.path_or_buf.seek(self.data_location + offset)
+            read_lines = min(nrows, self.nobs - self._lines_read)
+            data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
+                                 count=read_lines)
+
+            self._lines_read += read_lines
+            if self._lines_read == self.nobs:
+                self._can_read_value_labels = True
+                self._data_read = True
+            # if necessary, swap the byte order to native here
+            if self.byteorder != self._native_byteorder:
+                data = data.byteswap().newbyteorder()
+
+            if convert_categoricals:
+                self._read_value_labels()
+
+            if len(data) == 0:
+                data = DataFrame(columns=self.varlist, index=index)
+            else:
+                data = DataFrame.from_records(data, index=index)
+                data.columns = self.varlist
+
+            # If index is not specified, use actual row number rather than
+            # restarting at 0 for each chunk.
+            if index is None:
+                ix = np.arange(self._lines_read - read_lines, self._lines_read)
+                data = data.set_index(ix)
+
+            if columns is not None:
+                data = self._do_select_columns(data, columns)
+
+            # Decode strings
+            for col, typ in zip(data, self.typlist):
+                if type(typ) is int:
+                    data[col] = data[col].apply(
+                        self._null_terminate, convert_dtype=True)
+
+            data = self._insert_strls(data)
+
+            cols_ = np.where(self.dtyplist)[0]
+
+            # Convert columns (if needed) to match input type
+            index = data.index
+            requires_type_conversion = False
+            data_formatted = []
+            for i in cols_:
+                if self.dtyplist[i] is not None:
+                    col = data.columns[i]
+                    dtype = data[col].dtype
+                    if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
+                        requires_type_conversion = True
+                        data_formatted.append(
+                            (col, Series(data[col], index, self.dtyplist[i])))
+                    else:
+                        data_formatted.append((col, data[col]))
+            if requires_type_conversion:
+                data = DataFrame.from_items(data_formatted)
+            del data_formatted
+
+            self._do_convert_missing(data, convert_missing)
+
+            if convert_dates:
+                cols = np.where(lmap(lambda x: x in _date_formats,
+                                     self.fmtlist))[0]
+                for i in cols:
+                    col = data.columns[i]
+                    data[col] = _stata_elapsed_date_to_datetime_vec(
+                        data[col],
+                        self.fmtlist[i])
+
+            if convert_categoricals and self.format_version > 108:
+                data = self._do_convert_categoricals(data,
+                                                     self.value_label_dict,
+                                                     self.lbllist,
+                                                     order_categoricals)
+
+            if not preserve_dtypes:
+                retyped_data = []
+                convert = False
+                for col in data:
+                    dtype = data[col].dtype
+                    if dtype in (np.float16, np.float32):
+                        dtype = np.float64
+                        convert = True
+                    elif dtype in (np.int8, np.int16, np.int32):
+                        dtype = np.int64
+                        convert = True
+                    retyped_data.append((col, data[col].astype(dtype)))
+                if convert:
+                    data = DataFrame.from_items(retyped_data)
+        except:
+            self.close()
+            raise

        return data

From 30b61e60f358677289d8325df6a06f5cf98453bc Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 10:43:10 -0400
Subject: [PATCH 04/14] Properly close opened files in StataWriter

---
 pandas/io/stata.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 13244deda00d0..9a1a905479de8 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1881,9 +1881,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
-        self._file = _open_file_binary_write(
-            fname, self._encoding or self._default_encoding
-        )
+        self._fname = fname
        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}

    def _write(self, to_write):
@@ -2078,16 +2076,21 @@ def _prepare_pandas(self, data):
            self.fmtlist[key] = self._convert_dates[key]

    def write_file(self):
-        self._write_header(time_stamp=self._time_stamp,
-                           data_label=self._data_label)
-        self._write_descriptors()
-        self._write_variable_labels()
-        # write 5 zeros for expansion fields
-        self._write(_pad_bytes("", 5))
-        self._prepare_data()
-        self._write_data()
-        self._write_value_labels()
-        self._file.close()
+        self._file = _open_file_binary_write(
+            self._fname, self._encoding or self._default_encoding
+        )
+        try:
+            self._write_header(time_stamp=self._time_stamp,
+                               data_label=self._data_label)
+            self._write_descriptors()
+            self._write_variable_labels()
+            # write 5 zeros for expansion fields
+            self._write(_pad_bytes("", 5))
+            self._prepare_data()
+            self._write_data()
+            self._write_value_labels()
+        finally:
+            self._file.close()

    def _write_value_labels(self):
        for vl in self._value_labels:
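
Patches 03 and 04 apply the same rule to Stata I/O: whoever opens the file
must close it on every exit path. For the writer this means deferring the
open from ``__init__`` to ``write_file()`` and pairing it with try/finally.
A condensed sketch of that shape (simplified names, illustrative only):

    class WriterSketch(object):
        def __init__(self, fname):
            # Opening here would leak the handle if a later step raised
            # before write_file() ran; store the name instead.
            self._fname = fname

        def write_file(self):
            self._file = open(self._fname, 'wb')
            try:
                self._file.write(b'header and data go here')
            finally:
                self._file.close()  # runs on success and on error
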
From 99e16dd60c53d5fb007e7bb857c654e3d58a1e30 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 14:11:13 -0400
Subject: [PATCH 05/14] Fix whatsnew entries

---
 doc/source/whatsnew/v0.19.0.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index b09ebf9f9a5dd..86f509ad8656e 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -904,6 +904,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
 - Bug in ``pd.read_csv``, ``pd.read_table`` and ``pd.read_stata`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
 - Bug in ``StataReader`` and ``StataWriter`` where a file was not properly closed when an error was raised. (:issue:`13940`)
+
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
@@ -974,4 +975,7 @@ Bug Fixes

 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
 - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
+<<<<<<< 30b61e60f358677289d8325df6a06f5cf98453bc
 - Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`)
+=======
+>>>>>>> Fix whatsnew entries

From c7e9c9c34684f7ee4e45c69d68655037813bb680 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 14:37:17 -0400
Subject: [PATCH 06/14] On close, CParserWrapper must call self._reader.close()

---
 pandas/io/parsers.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 30023afa5a26a..a24a3f22f8325 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1436,6 +1436,14 @@ def __init__(self, src, **kwds):

        self._implicit_index = self._reader.leading_cols > 0

+    def close(self):
+        for f in self.handles:
+            f.close()
+        try:
+            self._reader.close()
+        except:
+            pass
+
    def _set_noconvert_columns(self):
        names = self.orig_names
        usecols = self.usecols

From 812e6ecddb1fa6c93329c2c165921a9c920e7e00 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 15:14:42 -0400
Subject: [PATCH 07/14] Fix long line

---
 pandas/io/stata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 9a1a905479de8..1156a07c6e9c7 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1511,7 +1511,8 @@ def read(self, nrows=None, convert_dates=None,
                if self.dtyplist[i] is not None:
                    col = data.columns[i]
                    dtype = data[col].dtype
-                    if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
+                    if ((dtype != np.dtype(object)) and
+                            (dtype != self.dtyplist[i])):
                        requires_type_conversion = True
                        data_formatted.append(
                            (col, Series(data[col], index, self.dtyplist[i])))

From 75fc34d213990f9a8b9fa48a65e045da492394fc Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 19:03:18 -0400
Subject: [PATCH 08/14] Make try/except blocks in StataReader.read as small
 as possible

---
 pandas/io/stata.py | 265 +++++++++++++++++++++++----------------------
 1 file changed, 135 insertions(+), 130 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 1156a07c6e9c7..469d7095e7d71 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1416,146 +1416,151 @@ def read(self, nrows=None, convert_dates=None,
            self.close()
            return DataFrame(columns=self.varlist)

-        try:
-            # Handle options
-            if convert_dates is None:
-                convert_dates = self._convert_dates
-            if convert_categoricals is None:
-                convert_categoricals = self._convert_categoricals
-            if convert_missing is None:
-                convert_missing = self._convert_missing
-            if preserve_dtypes is None:
-                preserve_dtypes = self._preserve_dtypes
-            if columns is None:
-                columns = self._columns
-            if order_categoricals is None:
-                order_categoricals = self._order_categoricals
-
-            if nrows is None:
-                nrows = self.nobs
-
-            if (self.format_version >= 117) and (self._dtype is None):
-                self._can_read_value_labels = True
-                self._read_strls()
-
-            # Setup the dtype.
-            if self._dtype is None:
-                dtype = []  # Convert struct data types to numpy data type
-                for i, typ in enumerate(self.typlist):
-                    if typ in self.NUMPY_TYPE_MAP:
-                        dtype.append(('s' + str(i), self.byteorder +
-                                      self.NUMPY_TYPE_MAP[typ]))
-                    else:
-                        dtype.append(('s' + str(i), 'S' + str(typ)))
-                dtype = np.dtype(dtype)
-                self._dtype = dtype
-
-            # Read data
-            dtype = self._dtype
-            max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
-            read_len = nrows * dtype.itemsize
-            read_len = min(read_len, max_read_len)
-            if read_len <= 0:
-                # Iterator has finished, should never be here unless
-                # we are reading the file incrementally
-                if convert_categoricals:
-                    self._read_value_labels()
-                raise StopIteration
-            offset = self._lines_read * dtype.itemsize
-            self.path_or_buf.seek(self.data_location + offset)
-            read_lines = min(nrows, self.nobs - self._lines_read)
-            data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
-                                 count=read_lines)
-
-            self._lines_read += read_lines
-            if self._lines_read == self.nobs:
-                self._can_read_value_labels = True
-                self._data_read = True
-            # if necessary, swap the byte order to native here
-            if self.byteorder != self._native_byteorder:
-                data = data.byteswap().newbyteorder()
-
-            if convert_categoricals:
-                self._read_value_labels()
-
-            if len(data) == 0:
-                data = DataFrame(columns=self.varlist, index=index)
-            else:
-                data = DataFrame.from_records(data, index=index)
-                data.columns = self.varlist
-
-            # If index is not specified, use actual row number rather than
-            # restarting at 0 for each chunk.
-            if index is None:
-                ix = np.arange(self._lines_read - read_lines, self._lines_read)
-                data = data.set_index(ix)
-
-            if columns is not None:
-                data = self._do_select_columns(data, columns)
-
-            # Decode strings
-            for col, typ in zip(data, self.typlist):
-                if type(typ) is int:
-                    data[col] = data[col].apply(
-                        self._null_terminate, convert_dtype=True)
-
-            data = self._insert_strls(data)
-
-            cols_ = np.where(self.dtyplist)[0]
-
-            # Convert columns (if needed) to match input type
-            index = data.index
-            requires_type_conversion = False
-            data_formatted = []
-            for i in cols_:
-                if self.dtyplist[i] is not None:
-                    col = data.columns[i]
-                    dtype = data[col].dtype
-                    if ((dtype != np.dtype(object)) and
-                            (dtype != self.dtyplist[i])):
-                        requires_type_conversion = True
-                        data_formatted.append(
-                            (col, Series(data[col], index, self.dtyplist[i])))
-                    else:
-                        data_formatted.append((col, data[col]))
-            if requires_type_conversion:
-                data = DataFrame.from_items(data_formatted)
-            del data_formatted
-
-            self._do_convert_missing(data, convert_missing)
-
-            if convert_dates:
-                cols = np.where(lmap(lambda x: x in _date_formats,
-                                     self.fmtlist))[0]
-                for i in cols:
-                    col = data.columns[i]
-                    data[col] = _stata_elapsed_date_to_datetime_vec(
-                        data[col],
-                        self.fmtlist[i])
-
-            if convert_categoricals and self.format_version > 108:
-                data = self._do_convert_categoricals(data,
-                                                     self.value_label_dict,
-                                                     self.lbllist,
-                                                     order_categoricals)
-
-            if not preserve_dtypes:
-                retyped_data = []
-                convert = False
-                for col in data:
-                    dtype = data[col].dtype
-                    if dtype in (np.float16, np.float32):
-                        dtype = np.float64
-                        convert = True
-                    elif dtype in (np.int8, np.int16, np.int32):
-                        dtype = np.int64
-                        convert = True
-                    retyped_data.append((col, data[col].astype(dtype)))
-                if convert:
-                    data = DataFrame.from_items(retyped_data)
-        except:
-            self.close()
-            raise
+        # Handle options
+        if convert_dates is None:
+            convert_dates = self._convert_dates
+        if convert_categoricals is None:
+            convert_categoricals = self._convert_categoricals
+        if convert_missing is None:
+            convert_missing = self._convert_missing
+        if preserve_dtypes is None:
+            preserve_dtypes = self._preserve_dtypes
+        if columns is None:
+            columns = self._columns
+        if order_categoricals is None:
+            order_categoricals = self._order_categoricals
+
+        if nrows is None:
+            nrows = self.nobs
+
+        if (self.format_version >= 117) and (self._dtype is None):
+            self._can_read_value_labels = True
+            self._read_strls()
+
+        # Setup the dtype.
+        if self._dtype is None:
+            dtype = []  # Convert struct data types to numpy data type
+            for i, typ in enumerate(self.typlist):
+                if typ in self.NUMPY_TYPE_MAP:
+                    dtype.append(('s' + str(i), self.byteorder +
+                                  self.NUMPY_TYPE_MAP[typ]))
+                else:
+                    dtype.append(('s' + str(i), 'S' + str(typ)))
+            dtype = np.dtype(dtype)
+            self._dtype = dtype
+
+        # Read data
+        dtype = self._dtype
+        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
+        read_len = nrows * dtype.itemsize
+        read_len = min(read_len, max_read_len)
+        if read_len <= 0:
+            # Iterator has finished, should never be here unless
+            # we are reading the file incrementally
+            if convert_categoricals:
+                self._read_value_labels()
+            self.close()
+            raise StopIteration
+        offset = self._lines_read * dtype.itemsize
+        self.path_or_buf.seek(self.data_location + offset)
+        read_lines = min(nrows, self.nobs - self._lines_read)
+        data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
+                             count=read_lines)
+
+        self._lines_read += read_lines
+        if self._lines_read == self.nobs:
+            self._can_read_value_labels = True
+            self._data_read = True
+        # if necessary, swap the byte order to native here
+        if self.byteorder != self._native_byteorder:
+            data = data.byteswap().newbyteorder()
+
+        if convert_categoricals:
+            self._read_value_labels()
+
+        if len(data) == 0:
+            data = DataFrame(columns=self.varlist, index=index)
+        else:
+            data = DataFrame.from_records(data, index=index)
+            data.columns = self.varlist
+
+        # If index is not specified, use actual row number rather than
+        # restarting at 0 for each chunk.
+        if index is None:
+            ix = np.arange(self._lines_read - read_lines, self._lines_read)
+            data = data.set_index(ix)
+
+        if columns is not None:
+            try:
+                data = self._do_select_columns(data, columns)
+            except ValueError:
+                self.close()
+                raise
+
+        # Decode strings
+        for col, typ in zip(data, self.typlist):
+            if type(typ) is int:
+                data[col] = data[col].apply(
+                    self._null_terminate, convert_dtype=True)
+
+        data = self._insert_strls(data)
+
+        cols_ = np.where(self.dtyplist)[0]
+
+        # Convert columns (if needed) to match input type
+        index = data.index
+        requires_type_conversion = False
+        data_formatted = []
+        for i in cols_:
+            if self.dtyplist[i] is not None:
+                col = data.columns[i]
+                dtype = data[col].dtype
+                if ((dtype != np.dtype(object)) and
+                        (dtype != self.dtyplist[i])):
+                    requires_type_conversion = True
+                    data_formatted.append(
+                        (col, Series(data[col], index, self.dtyplist[i])))
+                else:
+                    data_formatted.append((col, data[col]))
+        if requires_type_conversion:
+            data = DataFrame.from_items(data_formatted)
+        del data_formatted
+
+        self._do_convert_missing(data, convert_missing)
+
+        if convert_dates:
+            cols = np.where(lmap(lambda x: x in _date_formats,
+                                 self.fmtlist))[0]
+            for i in cols:
+                col = data.columns[i]
+                try:
+                    data[col] = _stata_elapsed_date_to_datetime_vec(
+                        data[col],
+                        self.fmtlist[i])
+                except ValueError:
+                    self.close()
+                    raise
+
+        if convert_categoricals and self.format_version > 108:
+            data = self._do_convert_categoricals(data,
+                                                 self.value_label_dict,
+                                                 self.lbllist,
+                                                 order_categoricals)
+
+        if not preserve_dtypes:
+            retyped_data = []
+            convert = False
+            for col in data:
+                dtype = data[col].dtype
+                if dtype in (np.float16, np.float32):
+                    dtype = np.float64
+                    convert = True
+                elif dtype in (np.int8, np.int16, np.int32):
+                    dtype = np.int64
+                    convert = True
+                retyped_data.append((col, data[col].astype(dtype)))
+            if convert:
+                data = DataFrame.from_items(retyped_data)

        return data
From 39dcd9924be914d0fd40c4adb567468807a65b31 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Tue, 9 Aug 2016 20:00:43 -0400
Subject: [PATCH 09/14] Fix rebase

---
 doc/source/whatsnew/v0.19.0.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 86f509ad8656e..d71d8a5a17bd3 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -975,7 +975,4 @@ Bug Fixes

 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
 - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
-<<<<<<< 30b61e60f358677289d8325df6a06f5cf98453bc
 - Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`)
-=======
->>>>>>> Fix whatsnew entries

From 52d1073c6589784243e9a6e4636cedc8ff118c7f Mon Sep 17 00:00:00 2001
From: agraboso
Date: Wed, 10 Aug 2016 08:56:38 -0400
Subject: [PATCH 10/14] Fix linting error

---
 pandas/io/stata.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 469d7095e7d71..e831bb2b95bf6 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1515,8 +1515,7 @@ def read(self, nrows=None, convert_dates=None,
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                dtype = data[col].dtype
-                if ((dtype != np.dtype(object)) and
-                        (dtype != self.dtyplist[i])):
+                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                    requires_type_conversion = True
                    data_formatted.append(
                        (col, Series(data[col], index, self.dtyplist[i])))

From 240383c55d50a88635c11fdb21f8f109ce866565 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Wed, 10 Aug 2016 08:57:51 -0400
Subject: [PATCH 11/14] Properly close opened files in two tests

---
 pandas/io/tests/sas/test_sas7bdat.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
index 6661d9fee5df0..b571d8d8b29ac 100644
--- a/pandas/io/tests/sas/test_sas7bdat.py
+++ b/pandas/io/tests/sas/test_sas7bdat.py
@@ -44,7 +44,8 @@ def test_from_buffer(self):
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
-                byts = open(fname, 'rb').read()
+                with open(fname, 'rb') as f:
+                    byts = f.read()
                buf = io.BytesIO(byts)
                df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
                tm.assert_frame_equal(df, df0, check_exact=False)
@@ -54,7 +55,8 @@ def test_from_iterator(self):
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
-                byts = open(fname, 'rb').read()
+                with open(fname, 'rb') as f:
+                    byts = f.read()
                buf = io.BytesIO(byts)
                rdr = pd.read_sas(buf, format="sas7bdat", iterator=True,
                                  encoding='utf-8')

From 7aa51840efac99f97329373d945c244e4c14e26f Mon Sep 17 00:00:00 2001
From: agraboso
Date: Wed, 10 Aug 2016 09:36:16 -0400
Subject: [PATCH 12/14] Properly close opened files in XportReader and
 SAS7BDATReader

---
 doc/source/whatsnew/v0.19.0.txt      |  4 ++--
 pandas/io/sas/sas7bdat.py            | 17 +++++++++++++++++
 pandas/io/sas/sas_xport.py           |  9 +++++++++
 pandas/io/sas/sasreader.py           |  4 +++-
 pandas/io/tests/sas/test_sas7bdat.py |  1 +
 pandas/io/tests/sas/test_xport.py    |  4 ++++
 6 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index d71d8a5a17bd3..b06b39380df64 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -902,8 +902,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
-- Bug in ``pd.read_csv``, ``pd.read_table`` and ``pd.read_stata`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
-- Bug in ``StataReader`` and ``StataWriter`` where a file was not properly closed when an error was raised. (:issue:`13940`)
+- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
+- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)

 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index b75f05cf9ed7e..2a82fd7a53222 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,

        self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
+            self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()

+    def close(self):
+        try:
+            self.handle.close()
+        except AttributeError:
+            pass
+
    def _get_properties(self):

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0:len(const.magic)] != const.magic:
+            self.close()
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
@@ -175,6 +183,7 @@ def _get_properties(self):
            buf = self._path_or_buf.read(self.header_length - 288)
            self._cached_page += buf
            if len(self._cached_page) != self.header_length:
+                self.close()
                raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(const.page_size_offset + align1,
@@ -219,6 +228,7 @@ def _get_properties(self):
    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset, width):
        if width not in (4, 8):
+            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
@@ -227,6 +237,7 @@ def _read_float(self, offset, width):
    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset, width):
        if width not in (1, 2, 4, 8):
+            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
@@ -238,11 +249,13 @@ def _read_bytes(self, offset, length):
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
+                self.close()
                msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
            return buf
        else:
            if offset + length > len(self._cached_page):
+                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset:offset + length]

@@ -253,6 +266,7 @@ def _parse_metadata(self):
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
+                self.close()
                raise ValueError(
                    "Failed to read a meta data page from the SAS file.")
            done = self._process_page_meta()
@@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype):
            if (self.compression != "") and f1 and f2:
                index = const.index.dataSubheaderIndex
            else:
+                self.close()
                raise ValueError("Unknown subheader signature")
        return index

@@ -598,6 +613,7 @@ def _read_next_page(self):
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
+            self.close()
            msg = ("failed to read complete page from file "
                   "(read {:d} of {:d} bytes)")
            raise ValueError(msg.format(len(self._cached_page),
@@ -643,6 +659,7 @@ def _chunk_to_dataframe(self):
                        rslt.loc[ii, name] = np.nan
                js += 1
            else:
+                self.close()
                raise ValueError("unknown column type %s" %
                                 self.column_types[j])
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index e4ca99fdcb109..76fc55154bc49 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',

        self._read_header()

+    def close(self):
+        self.filepath_or_buffer.close()
+
    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

@@ -262,6 +265,7 @@ def _read_header(self):
        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
+            self.close()
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
@@ -269,6 +273,7 @@ def _read_header(self):
               ['_', 24], ['created', 16]]
        file_info = _split_line(line2, fif)
        if file_info['prefix'] != "SAS     SAS     SASLIB":
+            self.close()
            raise ValueError("Header record has invalid prefix.")
        file_info['created'] = _parse_date(file_info['created'])
        self.file_info = file_info
@@ -282,6 +287,7 @@ def _read_header(self):
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = (header2 == _correct_header2)
        if not (headflag1 and headflag2):
+            self.close()
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])
@@ -321,6 +327,7 @@ def _read_header(self):
            field['ntype'] = types[field['ntype']]
            fl = field['field_length']
            if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+                self.close()
                msg = "Floating field width {0} is not between 2 and 8."
                raise TypeError(msg.format(fl))

@@ -335,6 +342,7 @@ def _read_header(self):

        header = self._get_row()
        if not header == _correct_obs_header:
+            self.close()
            raise ValueError("Observation header not found.")

        self.fields = fields
@@ -425,6 +433,7 @@ def read(self, nrows=None):
        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
+            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 9a60200c78893..081d780f71cb3 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
    if iterator or chunksize:
        return reader

-    return reader.read()
+    data = reader.read()
+    reader.close()
+    return data
diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
index b571d8d8b29ac..06eb9774679b1 100644
--- a/pandas/io/tests/sas/test_sas7bdat.py
+++ b/pandas/io/tests/sas/test_sas7bdat.py
@@ -81,6 +81,7 @@ def test_encoding_options():
    from pandas.io.sas.sas7bdat import SAS7BDATReader
    rdr = SAS7BDATReader(fname, convert_header_text=False)
    df3 = rdr.read()
+    rdr.close()
    for x, y in zip(df1.columns, df3.columns):
        assert(x == y.decode())
diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py
index ae378c41cd24b..d0627a80f9604 100644
--- a/pandas/io/tests/sas/test_xport.py
+++ b/pandas/io/tests/sas/test_xport.py
@@ -39,11 +39,13 @@ def test1_basic(self):
        # Test incremental read with `read` method.
        reader = read_sas(self.file01, format="xport", iterator=True)
        data = reader.read(10)
+        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test incremental read with `get_chunk` method.
        reader = read_sas(self.file01, format="xport", chunksize=10)
        data = reader.get_chunk()
+        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Read full file with `read_sas` method
@@ -66,6 +68,7 @@ def test1_index(self):
        reader = read_sas(self.file01, index="SEQN", format="xport",
                          iterator=True)
        data = reader.read(10)
+        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
                              check_index_type=False)

@@ -73,6 +76,7 @@ def test1_index(self):
        # Test incremental read with `get_chunk` method.
        reader = read_sas(self.file01, index="SEQN", format="xport",
                          chunksize=10)
        data = reader.get_chunk()
+        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
                              check_index_type=False)

From 6592c7384e93c79d537b56e66547a8d720e841c6 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Wed, 10 Aug 2016 10:13:01 -0400
Subject: [PATCH 13/14] Do not acquire list as file handler to close

---
 pandas/io/parsers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index a24a3f22f8325..57d6aaa571eec 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1766,16 +1766,18 @@ def __init__(self, f, **kwds):
            f = _get_handle(f, 'r', encoding=self.encoding,
                            compression=self.compression,
                            memory_map=self.memory_map)
+            self.handles.append(f)
        elif self.compression:
            f = _wrap_compressed(f, self.compression, self.encoding)
+            self.handles.append(f)
        # in Python 3, convert BytesIO or fileobjects passed with an encoding
        elif compat.PY3 and isinstance(f, compat.BytesIO):
            from io import TextIOWrapper

            f = TextIOWrapper(f, encoding=self.encoding)
+            self.handles.append(f)

        # Set self.data to something that can read lines.
-        self.handles.append(f)
        if hasattr(f, 'readline'):
            self._make_reader(f)
        else:
From 3fa7d25a00372a1312cd8fad0a82082e997f51a4 Mon Sep 17 00:00:00 2001
From: agraboso
Date: Wed, 10 Aug 2016 10:14:05 -0400
Subject: [PATCH 14/14] Close open files in TextFileReader upon StopIteration

---
 pandas/io/parsers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 57d6aaa571eec..5372203318d69 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -899,7 +899,11 @@ def _clean_options(self, options, engine):
        return result, engine

    def __next__(self):
-        return self.get_chunk()
+        try:
+            return self.get_chunk()
+        except StopIteration:
+            self.close()
+            raise

    def _make_engine(self, engine='c'):
        if engine == 'c':
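
Taken together, the series gives the readers a predictable lifecycle: the
non-iterating paths close their files as soon as ``read()`` returns, and the
iterating paths close them when iteration is exhausted or when the caller
closes the reader explicitly. A rough end-to-end sketch of the resulting
behavior (the file name and the Windows-specific failure mode are
illustrative assumptions, not part of the series):

    import os
    import pandas as pd

    with open('tmp.csv', 'w') as f:
        f.write('a,b\n1,2\n')

    # Non-iterating path: the parser opens tmp.csv itself and, with this
    # series applied, closes it as soon as read() returns.
    df = pd.read_csv('tmp.csv')

    # Iterating path: exhausting the reader triggers __next__'s
    # StopIteration handler, which now closes the underlying handles.
    for chunk in pd.read_csv('tmp.csv', chunksize=1):
        pass

    # Previously this could raise PermissionError on Windows because the
    # parser still held an open handle; now the file is no longer locked.
    os.remove('tmp.csv')
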