diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 4f81eafa3adaf..b06b39380df64 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -902,6 +902,9 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
+- Bug in ``pd.read_csv()``, ``pd.read_table()``, ``pd.read_fwf()``, ``pd.read_stata()``, and ``pd.read_sas()`` where files opened by the parsers were not closed if both ``chunksize`` and ``iterator`` were ``None`` (:issue:`13940`)
+- Bug in ``StataReader``, ``StataWriter``, ``XportReader``, and ``SAS7BDATReader`` where a file was not properly closed when an error was raised (:issue:`13940`)
+
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6f9bddd0fdf9b..b7ac183b7ab41 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -327,7 +327,9 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
     if memory_map and hasattr(f, 'fileno'):
         try:
-            f = MMapWrapper(f)
+            g = MMapWrapper(f)
+            f.close()
+            f = g
         except Exception:
             # we catch any errors that may have occurred
             # because that is consistent with the lower-level
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 7846ccd1a6660..5372203318d69 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -393,11 +393,15 @@ def _read(filepath_or_buffer, kwds):
         raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
                                   " together yet.")
     elif nrows is not None:
-        return parser.read(nrows)
+        data = parser.read(nrows)
+        parser.close()
+        return data
     elif chunksize or iterator:
         return parser
 
-    return parser.read()
+    data = parser.read()
+    parser.close()
+    return data
 
 _parser_defaults = {
     'delimiter': None,
@@ -727,10 +731,7 @@ def __init__(self, f, engine=None, **kwds):
         self._make_engine(self.engine)
 
     def close(self):
-        try:
-            self._engine._reader.close()
-        except:
-            pass
+        self._engine.close()
 
     def _get_options_with_defaults(self, engine):
         kwds = self.orig_options
@@ -898,7 +899,11 @@ def _clean_options(self, options, engine):
         return result, engine
 
     def __next__(self):
-        return self.get_chunk()
+        try:
+            return self.get_chunk()
+        except StopIteration:
+            self.close()
+            raise
 
     def _make_engine(self, engine='c'):
         if engine == 'c':
@@ -1057,8 +1062,13 @@ def __init__(self, kwds):
 
         self._first_chunk = True
 
+        # GH 13932
+        # keep references to file handles opened by the parser itself
+        self.handles = []
+
     def close(self):
-        self._reader.close()
+        for f in self.handles:
+            f.close()
 
     @property
     def _has_complex_date_col(self):
@@ -1356,6 +1366,7 @@ def __init__(self, src, **kwds):
         if 'utf-16' in (kwds.get('encoding') or ''):
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
+                self.handles.append(src)
             src = UTF8Recoder(src, kwds['encoding'])
             kwds['encoding'] = 'utf-8'
 
@@ -1429,6 +1440,14 @@ def __init__(self, src, **kwds):
         self._implicit_index = self._reader.leading_cols > 0
 
+    def close(self):
+        for f in self.handles:
+            f.close()
+        try:
+            self._reader.close()
+        except:
+            pass
+
     def _set_noconvert_columns(self):
         names = self.orig_names
         usecols = self.usecols
@@ -1751,13 +1770,16 @@ def __init__(self, f, **kwds):
             f = _get_handle(f, 'r', encoding=self.encoding,
                             compression=self.compression,
                             memory_map=self.memory_map)
+            self.handles.append(f)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
+            self.handles.append(f)
         # in Python 3, convert BytesIO or fileobjects passed with an encoding
         elif compat.PY3 and isinstance(f, compat.BytesIO):
             from io import TextIOWrapper
             f = TextIOWrapper(f, encoding=self.encoding)
+            self.handles.append(f)
 
         # Set self.data to something that can read lines.
         if hasattr(f, 'readline'):
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index b75f05cf9ed7e..2a82fd7a53222 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
         if isinstance(self._path_or_buf, compat.string_types):
             self._path_or_buf = open(self._path_or_buf, 'rb')
+            self.handle = self._path_or_buf
 
         self._get_properties()
         self._parse_metadata()
 
+    def close(self):
+        try:
+            self.handle.close()
+        except AttributeError:
+            pass
+
     def _get_properties(self):
 
         # Check magic number
         self._path_or_buf.seek(0)
         self._cached_page = self._path_or_buf.read(288)
         if self._cached_page[0:len(const.magic)] != const.magic:
+            self.close()
             raise ValueError("magic number mismatch (not a SAS file?)")
 
         # Get alignment information
@@ -175,6 +183,7 @@ def _get_properties(self):
         buf = self._path_or_buf.read(self.header_length - 288)
         self._cached_page += buf
         if len(self._cached_page) != self.header_length:
+            self.close()
             raise ValueError("The SAS7BDAT file appears to be truncated.")
 
         self._page_length = self._read_int(const.page_size_offset + align1,
@@ -219,6 +228,7 @@
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
         if width not in (4, 8):
+            self.close()
             raise ValueError("invalid float width")
         buf = self._read_bytes(offset, width)
         fd = "f" if width == 4 else "d"
@@ -227,6 +237,7 @@
     # Read a single signed integer of the given width (1, 2, 4 or 8).
     def _read_int(self, offset, width):
         if width not in (1, 2, 4, 8):
+            self.close()
             raise ValueError("invalid int width")
         buf = self._read_bytes(offset, width)
         it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
@@ -238,11 +249,13 @@ def _read_bytes(self, offset, length):
             self._path_or_buf.seek(offset)
             buf = self._path_or_buf.read(length)
             if len(buf) < length:
+                self.close()
                 msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
             return buf
         else:
             if offset + length > len(self._cached_page):
+                self.close()
                 raise ValueError("The cached page is too small.")
             return self._cached_page[offset:offset + length]
@@ -253,6 +266,7 @@ def _parse_metadata(self):
             if len(self._cached_page) <= 0:
                 break
             if len(self._cached_page) != self._page_length:
+                self.close()
                 raise ValueError(
                     "Failed to read a meta data page from the SAS file.")
             done = self._process_page_meta()
@@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype):
         if (self.compression != "") and f1 and f2:
             index = const.index.dataSubheaderIndex
         else:
+            self.close()
             raise ValueError("Unknown subheader signature")
         return index
@@ -598,6 +613,7 @@ def _read_next_page(self):
         if len(self._cached_page) <= 0:
             return True
         elif len(self._cached_page) != self._page_length:
+            self.close()
             msg = ("failed to read complete page from file "
                    "(read {:d} of {:d} bytes)")
             raise ValueError(msg.format(len(self._cached_page),
@@ -643,6 +659,7 @@ def _chunk_to_dataframe(self):
                 rslt.loc[ii, name] = np.nan
                 js += 1
             else:
+                self.close()
                 raise ValueError("unknown column type %s" %
                                  self.column_types[j])
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index e4ca99fdcb109..76fc55154bc49 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
 
         self._read_header()
 
+    def close(self):
+        self.filepath_or_buffer.close()
+
     def _get_row(self):
         return self.filepath_or_buffer.read(80).decode()
 
@@ -262,6 +265,7 @@ def _read_header(self):
         # read file header
         line1 = self._get_row()
         if line1 != _correct_line1:
+            self.close()
             raise ValueError("Header record is not an XPORT file.")
 
         line2 = self._get_row()
@@ -269,6 +273,7 @@
                ['_', 24], ['created', 16]]
         file_info = _split_line(line2, fif)
         if file_info['prefix'] != "SAS     SAS     SASLIB":
+            self.close()
             raise ValueError("Header record has invalid prefix.")
         file_info['created'] = _parse_date(file_info['created'])
         self.file_info = file_info
@@ -282,6 +287,7 @@
         headflag1 = header1.startswith(_correct_header1)
         headflag2 = (header2 == _correct_header2)
         if not (headflag1 and headflag2):
+            self.close()
             raise ValueError("Member header not found")
         # usually 140, could be 135
         fieldnamelength = int(header1[-5:-2])
@@ -321,6 +327,7 @@
             field['ntype'] = types[field['ntype']]
             fl = field['field_length']
             if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+                self.close()
                 msg = "Floating field width {0} is not between 2 and 8."
                raise TypeError(msg.format(fl))
 
@@ -335,6 +342,7 @@
 
         header = self._get_row()
         if not header == _correct_obs_header:
+            self.close()
             raise ValueError("Observation header not found.")
 
         self.fields = fields
@@ -425,6 +433,7 @@ def read(self, nrows=None):
         read_lines = min(nrows, self.nobs - self._lines_read)
         read_len = read_lines * self.record_length
         if read_len <= 0:
+            self.close()
             raise StopIteration
         raw = self.filepath_or_buffer.read(read_len)
         data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 9a60200c78893..081d780f71cb3 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
     if iterator or chunksize:
         return reader
 
-    return reader.read()
+    data = reader.read()
+    reader.close()
+    return data
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59bc24acac6f8..e831bb2b95bf6 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -167,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                         chunksize=chunksize, encoding=encoding)
 
     if iterator or chunksize:
-        try:
-            return reader
-        except StopIteration:
-            reader.close()
-
-    try:
-        return reader.read()
-    finally:
+        data = reader
+    else:
+        data = reader.read()
         reader.close()
+    return data
 
 _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
@@ -1411,13 +1407,13 @@ def read(self, nrows=None, convert_dates=None,
              convert_categoricals=None, index=None, convert_missing=None,
              preserve_dtypes=None, columns=None, order_categoricals=None):
-
         # Handle empty file or chunk. If reading incrementally raise
         # StopIteration. If reading the whole thing return an empty
         # data frame.
         if (self.nobs == 0) and (nrows is None):
             self._can_read_value_labels = True
             self._data_read = True
+            self.close()
             return DataFrame(columns=self.varlist)
 
         # Handle options
@@ -1463,6 +1459,7 @@ def read(self, nrows=None, convert_dates=None,
             # we are reading the file incrementally
             if convert_categoricals:
                 self._read_value_labels()
+            self.close()
             raise StopIteration
         offset = self._lines_read * dtype.itemsize
         self.path_or_buf.seek(self.data_location + offset)
@@ -1494,7 +1491,11 @@ def read(self, nrows=None, convert_dates=None,
             data = data.set_index(ix)
 
         if columns is not None:
-            data = self._do_select_columns(data, columns)
+            try:
+                data = self._do_select_columns(data, columns)
+            except ValueError:
+                self.close()
+                raise
 
         # Decode strings
         for col, typ in zip(data, self.typlist):
@@ -1514,7 +1515,7 @@ def read(self, nrows=None, convert_dates=None,
             if self.dtyplist[i] is not None:
                 col = data.columns[i]
                 dtype = data[col].dtype
-                if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
+                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                     requires_type_conversion = True
                     data_formatted.append(
                         (col, Series(data[col], index, self.dtyplist[i])))
@@ -1531,9 +1532,13 @@ def read(self, nrows=None, convert_dates=None,
                                 self.fmtlist))[0]
         for i in cols:
             col = data.columns[i]
-            data[col] = _stata_elapsed_date_to_datetime_vec(
-                data[col],
-                self.fmtlist[i])
+            try:
+                data[col] = _stata_elapsed_date_to_datetime_vec(
+                    data[col],
+                    self.fmtlist[i])
+            except ValueError:
+                self.close()
+                raise
 
         if convert_categoricals and self.format_version > 108:
             data = self._do_convert_categoricals(data,
@@ -1881,9 +1886,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
         if byteorder is None:
             byteorder = sys.byteorder
         self._byteorder = _set_endianness(byteorder)
-        self._file = _open_file_binary_write(
-            fname, self._encoding or self._default_encoding
-        )
+        self._fname = fname
         self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
 
     def _write(self, to_write):
@@ -2078,16 +2081,21 @@ def _prepare_pandas(self, data):
             self.fmtlist[key] = self._convert_dates[key]
 
     def write_file(self):
-        self._write_header(time_stamp=self._time_stamp,
-                           data_label=self._data_label)
-        self._write_descriptors()
-        self._write_variable_labels()
-        # write 5 zeros for expansion fields
-        self._write(_pad_bytes("", 5))
-        self._prepare_data()
-        self._write_data()
-        self._write_value_labels()
-        self._file.close()
+        self._file = _open_file_binary_write(
+            self._fname, self._encoding or self._default_encoding
+        )
+        try:
+            self._write_header(time_stamp=self._time_stamp,
+                               data_label=self._data_label)
+            self._write_descriptors()
+            self._write_variable_labels()
+            # write 5 zeros for expansion fields
+            self._write(_pad_bytes("", 5))
+            self._prepare_data()
+            self._write_data()
+            self._write_value_labels()
+        finally:
+            self._file.close()
 
     def _write_value_labels(self):
         for vl in self._value_labels:
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 619ac7b4c77ef..96eb0ec6fd7a2 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1580,5 +1580,6 @@ def test_temporary_file(self):
             new_file.seek(0)
 
             result = self.read_csv(new_file, sep='\s+', header=None)
+            new_file.close()
             expected = DataFrame([[0, 0]])
             tm.assert_frame_equal(result, expected)
diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py
index a7389fd174e1d..3214aa39358e8 100644
--- a/pandas/io/tests/parser/python_parser_only.py
+++ b/pandas/io/tests/parser/python_parser_only.py
@@ -130,7 +130,8 @@ def test_decompression_regex_sep(self):
         except ImportError:
             raise nose.SkipTest('need gzip and bz2 to run')
 
-        data = open(self.csv1, 'rb').read()
+        with open(self.csv1, 'rb') as f:
+            data = f.read()
         data = data.replace(b',', b'::')
         expected = self.read_csv(self.csv1)
diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py
index fd2f49cef656a..7dda9eb9d0af4 100644
--- a/pandas/io/tests/parser/test_textreader.py
+++ b/pandas/io/tests/parser/test_textreader.py
@@ -54,7 +54,8 @@ def test_file_handle_mmap(self):
         f.close()
 
     def test_StringIO(self):
-        text = open(self.csv1, 'rb').read()
+        with open(self.csv1, 'rb') as f:
+            text = f.read()
         src = BytesIO(text)
         reader = TextReader(src, header=None)
         reader.read()
diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
index 6661d9fee5df0..06eb9774679b1 100644
--- a/pandas/io/tests/sas/test_sas7bdat.py
+++ b/pandas/io/tests/sas/test_sas7bdat.py
@@ -44,7 +44,8 @@ def test_from_buffer(self):
             df0 = self.data[j]
             for k in self.test_ix[j]:
                 fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
-                byts = open(fname, 'rb').read()
+                with open(fname, 'rb') as f:
+                    byts = f.read()
                 buf = io.BytesIO(byts)
                 df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
                 tm.assert_frame_equal(df, df0, check_exact=False)
@@ -54,7 +55,8 @@ def test_from_iterator(self):
             df0 = self.data[j]
             for k in self.test_ix[j]:
                 fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
-                byts = open(fname, 'rb').read()
+                with open(fname, 'rb') as f:
+                    byts = f.read()
                 buf = io.BytesIO(byts)
                 rdr = pd.read_sas(buf, format="sas7bdat", iterator=True,
                                   encoding='utf-8')
@@ -79,6 +81,7 @@ def test_encoding_options():
     from pandas.io.sas.sas7bdat import SAS7BDATReader
     rdr = SAS7BDATReader(fname, convert_header_text=False)
     df3 = rdr.read()
+    rdr.close()
     for x, y in zip(df1.columns, df3.columns):
         assert(x == y.decode())
diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py
index ae378c41cd24b..d0627a80f9604 100644
--- a/pandas/io/tests/sas/test_xport.py
+++ b/pandas/io/tests/sas/test_xport.py
@@ -39,11 +39,13 @@ def test1_basic(self):
         # Test incremental read with `read` method.
         reader = read_sas(self.file01, format="xport", iterator=True)
         data = reader.read(10)
+        reader.close()
         tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
 
         # Test incremental read with `get_chunk` method.
reader = read_sas(self.file01, format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Read full file with `read_sas` method @@ -66,6 +68,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) @@ -73,6 +76,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index a443df5dac586..c08d235b07c9e 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -116,8 +116,8 @@ def test_constructor_bad_file(self): tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target) def test_get_attr(self): - target = open(self.mmap_file, 'r') - wrapper = common.MMapWrapper(target) + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs @@ -130,10 +130,9 @@ def test_get_attr(self): self.assertFalse(hasattr(wrapper, 'foo')) def test_next(self): - target = open(self.mmap_file, 'r') - wrapper = common.MMapWrapper(target) - - lines = target.readlines() + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + lines = target.readlines() for line in lines: next_line = next(wrapper) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f89501d39f014..48528dc54adbd 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -64,7 +64,8 @@ def test_to_csv(self): with ensure_clean() as path: self.ts.to_csv(path) - lines = io.open(path, newline=None).readlines() + with io.open(path, newline=None) as f: + lines = f.readlines() assert (lines[1] != '\n') self.ts.to_csv(path, index=False)
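
Note (reviewer sketch, not part of the patch): the example below illustrates the behaviour this change guarantees. A one-shot read_csv call closes any file handle the parser opened itself, while iterator/chunked readers keep their handle open by design until the caller closes them, as the updated tests now do. The temporary-file setup and variable names here are illustrative only.

    import os
    import tempfile

    import pandas as pd

    # Create a small CSV file on disk.
    fd, path = tempfile.mkstemp(suffix='.csv')
    with os.fdopen(fd, 'w') as handle:
        handle.write('a,b\n1,2\n3,4\n')

    # Full read: the parser opens the file itself and, with this patch,
    # closes it again as soon as the read completes.
    df = pd.read_csv(path)

    # Chunked read: the TextFileReader keeps its handle open, so the
    # caller closes it once iteration is done.
    reader = pd.read_csv(path, chunksize=1)
    chunk = reader.get_chunk()
    reader.close()

    # No handle is leaked, so the file can be removed immediately; on
    # Windows this line would fail if the parser still held the file open.
    os.remove(path)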