BUG: properly close files opened by parsers #13940

Closed · wants to merge 14 commits
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
```diff
@@ -902,6 +902,9 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
+- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
+- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
+
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
```
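A minimal sketch of the symptom behind the first new entry (file name hypothetical): before this fix, `pd.read_csv` opened the file itself and left the handle dangling until garbage collection, which could make an immediate delete fail on Windows.

```python
import os
import pandas as pd

path = "example.csv"  # hypothetical throwaway file
with open(path, "w") as f:
    f.write("a,b\n1,2\n")

pd.read_csv(path)  # the parser opens `path` itself and now closes it on return
os.remove(path)    # before the fix this could raise PermissionError on Windows
```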
4 changes: 3 additions & 1 deletion pandas/io/common.py
```diff
@@ -327,7 +327,9 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
 
     if memory_map and hasattr(f, 'fileno'):
         try:
-            f = MMapWrapper(f)
+            g = MMapWrapper(f)
+            f.close()
+            f = g
         except Exception:
             # we catch any errors that may have occurred
             # because that is consistent with the lower-level
```
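This hunk relies on the fact that closing the original file object does not invalidate an existing memory map; a minimal sketch of that behavior (file name hypothetical):

```python
import mmap

with open("example.csv", "w") as f:
    f.write("a,b\n1,2\n")

with open("example.csv", "rb") as f:
    m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
# the with-block has closed f, but the mapping is still readable
print(m.readline())  # b'a,b\n'
m.close()
```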
38 changes: 30 additions & 8 deletions pandas/io/parsers.py
```diff
@@ -393,11 +393,15 @@ def _read(filepath_or_buffer, kwds):
         raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
                                   " together yet.")
     elif nrows is not None:
-        return parser.read(nrows)
+        data = parser.read(nrows)
+        parser.close()
+        return data
     elif chunksize or iterator:
         return parser
 
-    return parser.read()
+    data = parser.read()
+    parser.close()
+    return data
 
 _parser_defaults = {
     'delimiter': None,
```

**jreback** (Contributor) commented on the `data = parser.read(nrows)` line, Aug 11, 2016:

> if you do
>
> ```python
> try:
>     return parser.read(nrows)
> finally:
>     parser.close()
> ```
>
> I think you can eliminate the `.close()` calls where you raise on an exception (also needs changing at L402), and it is slightly more idiomatic. That way exceptions will bubble up, but we will still close.
A contributor commented on the second `parser.close()` call:

> e.g. here
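A sketch (not the merged code) of how the tail of `_read` would look with the suggested pattern applied to both call sites; `_read_tail` is a hypothetical name for illustration:

```python
def _read_tail(parser, nrows, chunksize, iterator):
    # sketch of the suggestion: let exceptions bubble up, close regardless
    if nrows is not None:
        try:
            return parser.read(nrows)
        finally:
            parser.close()
    if chunksize or iterator:
        return parser  # caller iterates and closes
    try:
        return parser.read()
    finally:
        parser.close()
```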
```diff
@@ -727,10 +731,7 @@ def __init__(self, f, engine=None, **kwds):
         self._make_engine(self.engine)
 
     def close(self):
-        try:
-            self._engine._reader.close()
-        except:
-            pass
+        self._engine.close()
 
     def _get_options_with_defaults(self, engine):
         kwds = self.orig_options
```

A Member asked:

> Why can we get rid of this try-except block?

**agraboso** (Contributor, Author) replied, Aug 9, 2016:

> I shouldn't have removed it. But this was only used by `CParserWrapper`, so I have put it there now in c7e9c9c.
```diff
@@ -898,7 +899,11 @@ def _clean_options(self, options, engine):
         return result, engine
 
     def __next__(self):
-        return self.get_chunk()
+        try:
+            return self.get_chunk()
+        except StopIteration:
+            self.close()
+            raise
 
     def _make_engine(self, engine='c'):
         if engine == 'c':
```
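With this change, exhausting a chunked reader closes its file handle automatically; a usage sketch (file name hypothetical):

```python
import pandas as pd

reader = pd.read_csv("example.csv", chunksize=1)
for chunk in reader:   # each step goes through __next__ / get_chunk()
    print(chunk.shape)
# the terminating StopIteration triggered reader.close()
```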
```diff
@@ -1057,8 +1062,13 @@ def __init__(self, kwds):
 
         self._first_chunk = True
 
+        # GH 13932
+        # keep references to file handles opened by the parser itself
+        self.handles = []
+
     def close(self):
-        self._reader.close()
+        for f in self.handles:
+            f.close()
 
     @property
     def _has_complex_date_col(self):
```
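The pattern in this hunk, reduced to a standalone sketch (class name hypothetical): the parser records only the handles it opened itself, so `close()` never touches caller-owned files.

```python
class HandleTracker(object):
    """Sketch of the handle-tracking pattern used by the parsers."""

    def __init__(self):
        self.handles = []

    def open_file(self, path):
        f = open(path, 'rb')
        self.handles.append(f)  # remember it: we own this handle
        return f

    def close(self):
        for f in self.handles:  # caller-provided objects are never in the list
            f.close()
```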
```diff
@@ -1356,6 +1366,7 @@ def __init__(self, src, **kwds):
         if 'utf-16' in (kwds.get('encoding') or ''):
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
+                self.handles.append(src)
             src = UTF8Recoder(src, kwds['encoding'])
             kwds['encoding'] = 'utf-8'
```
```diff
@@ -1429,6 +1440,14 @@ def __init__(self, src, **kwds):
 
         self._implicit_index = self._reader.leading_cols > 0
 
+    def close(self):
+        for f in self.handles:
+            f.close()
+        try:
+            self._reader.close()
+        except:
+            pass
+
     def _set_noconvert_columns(self):
         names = self.orig_names
         usecols = self.usecols
```
```diff
@@ -1751,13 +1770,16 @@ def __init__(self, f, **kwds):
             f = _get_handle(f, 'r', encoding=self.encoding,
                             compression=self.compression,
                             memory_map=self.memory_map)
+            self.handles.append(f)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
+            self.handles.append(f)
         # in Python 3, convert BytesIO or fileobjects passed with an encoding
         elif compat.PY3 and isinstance(f, compat.BytesIO):
             from io import TextIOWrapper
 
             f = TextIOWrapper(f, encoding=self.encoding)
+            self.handles.append(f)
 
         # Set self.data to something that can read lines.
         if hasattr(f, 'readline'):
```
17 changes: 17 additions & 0 deletions pandas/io/sas/sas7bdat.py
```diff
@@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
         if isinstance(self._path_or_buf, compat.string_types):
             self._path_or_buf = open(self._path_or_buf, 'rb')
+            self.handle = self._path_or_buf
 
         self._get_properties()
         self._parse_metadata()
 
+    def close(self):
+        try:
+            self.handle.close()
+        except AttributeError:
+            pass
+
     def _get_properties(self):
 
         # Check magic number
         self._path_or_buf.seek(0)
         self._cached_page = self._path_or_buf.read(288)
         if self._cached_page[0:len(const.magic)] != const.magic:
+            self.close()
             raise ValueError("magic number mismatch (not a SAS file?)")
 
         # Get alignment information
```

```diff
@@ -175,6 +183,7 @@ def _get_properties(self):
         buf = self._path_or_buf.read(self.header_length - 288)
         self._cached_page += buf
         if len(self._cached_page) != self.header_length:
+            self.close()
             raise ValueError("The SAS7BDAT file appears to be truncated.")
 
         self._page_length = self._read_int(const.page_size_offset + align1,
```

```diff
@@ -219,6 +228,7 @@ def _get_properties(self):
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
         if width not in (4, 8):
+            self.close()
             raise ValueError("invalid float width")
         buf = self._read_bytes(offset, width)
         fd = "f" if width == 4 else "d"
```

```diff
@@ -227,6 +237,7 @@ def _read_float(self, offset, width):
     # Read a single signed integer of the given width (1, 2, 4 or 8).
     def _read_int(self, offset, width):
         if width not in (1, 2, 4, 8):
+            self.close()
             raise ValueError("invalid int width")
         buf = self._read_bytes(offset, width)
         it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
```

```diff
@@ -238,11 +249,13 @@ def _read_bytes(self, offset, length):
             self._path_or_buf.seek(offset)
             buf = self._path_or_buf.read(length)
             if len(buf) < length:
+                self.close()
                 msg = "Unable to read {:d} bytes from file position {:d}."
                 raise ValueError(msg.format(length, offset))
             return buf
         else:
             if offset + length > len(self._cached_page):
+                self.close()
                 raise ValueError("The cached page is too small.")
             return self._cached_page[offset:offset + length]
```

```diff
@@ -253,6 +266,7 @@ def _parse_metadata(self):
             if len(self._cached_page) <= 0:
                 break
             if len(self._cached_page) != self._page_length:
+                self.close()
                 raise ValueError(
                     "Failed to read a meta data page from the SAS file.")
             done = self._process_page_meta()
```

```diff
@@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype):
         if (self.compression != "") and f1 and f2:
             index = const.index.dataSubheaderIndex
         else:
+            self.close()
             raise ValueError("Unknown subheader signature")
         return index
```

```diff
@@ -598,6 +613,7 @@ def _read_next_page(self):
         if len(self._cached_page) <= 0:
             return True
         elif len(self._cached_page) != self._page_length:
+            self.close()
             msg = ("failed to read complete page from file "
                    "(read {:d} of {:d} bytes)")
             raise ValueError(msg.format(len(self._cached_page),
```

```diff
@@ -643,6 +659,7 @@ def _chunk_to_dataframe(self):
                 rslt.loc[ii, name] = np.nan
                 js += 1
             else:
+                self.close()
                 raise ValueError("unknown column type %s" %
                                  self.column_types[j])
```
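The common thread in these hunks: every early `raise` in `SAS7BDATReader` is now preceded by `self.close()`. A hedged sketch of the visible effect (file name hypothetical):

```python
import pandas as pd

with open("bogus.sas7bdat", "wb") as f:
    f.write(b"definitely not a SAS file" * 20)

try:
    pd.read_sas("bogus.sas7bdat", format="sas7bdat")
except ValueError:
    pass  # "magic number mismatch"; the reader closed its own handle first
```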
9 changes: 9 additions & 0 deletions pandas/io/sas/sas_xport.py
```diff
@@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
 
         self._read_header()
 
+    def close(self):
+        self.filepath_or_buffer.close()
+
     def _get_row(self):
         return self.filepath_or_buffer.read(80).decode()
```

A contributor asked on `self.filepath_or_buffer.close()`:

> is this always a filehandle-like?

**agraboso** (Contributor, Author) replied:

> It's either a file handle coming from L244 or a BytesIO object coming from L252. So it is always safe to close it.
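A minimal sketch of the reviewer's point: whether `filepath_or_buffer` ended up as a real file object or an in-memory `BytesIO`, calling `.close()` is safe (the objects below are stand-ins):

```python
import io
import tempfile

fh = tempfile.TemporaryFile()     # stands in for open(path, 'rb')
bio = io.BytesIO(b"xport bytes")  # stands in for a caller-supplied buffer
for obj in (fh, bio):
    obj.close()                   # both implement close()
```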

```diff
@@ -262,13 +265,15 @@ def _read_header(self):
         # read file header
         line1 = self._get_row()
         if line1 != _correct_line1:
+            self.close()
             raise ValueError("Header record is not an XPORT file.")
 
         line2 = self._get_row()
         fif = [['prefix', 24], ['version', 8], ['OS', 8],
                ['_', 24], ['created', 16]]
         file_info = _split_line(line2, fif)
         if file_info['prefix'] != "SAS SAS SASLIB":
+            self.close()
             raise ValueError("Header record has invalid prefix.")
         file_info['created'] = _parse_date(file_info['created'])
         self.file_info = file_info
```

```diff
@@ -282,6 +287,7 @@
         headflag1 = header1.startswith(_correct_header1)
         headflag2 = (header2 == _correct_header2)
         if not (headflag1 and headflag2):
+            self.close()
             raise ValueError("Member header not found")
         # usually 140, could be 135
         fieldnamelength = int(header1[-5:-2])
```

```diff
@@ -321,6 +327,7 @@
         field['ntype'] = types[field['ntype']]
         fl = field['field_length']
         if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+            self.close()
             msg = "Floating field width {0} is not between 2 and 8."
             raise TypeError(msg.format(fl))
```

```diff
@@ -335,6 +342,7 @@
 
         header = self._get_row()
         if not header == _correct_obs_header:
+            self.close()
             raise ValueError("Observation header not found.")
 
         self.fields = fields
```

```diff
@@ -425,6 +433,7 @@ def read(self, nrows=None):
         read_lines = min(nrows, self.nobs - self._lines_read)
         read_len = read_lines * self.record_length
         if read_len <= 0:
+            self.close()
             raise StopIteration
         raw = self.filepath_or_buffer.read(read_len)
         data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
```
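Usage sketch for the `read` change: reading past the end raises `StopIteration`, and the reader now closes its file first (path hypothetical):

```python
import pandas as pd

reader = pd.read_sas("data.xpt", format="xport", iterator=True)
try:
    while True:
        chunk = reader.read(1000)  # raises StopIteration once exhausted
except StopIteration:
    pass  # the reader already closed its file handle
```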
4 changes: 3 additions & 1 deletion pandas/io/sas/sasreader.py
```diff
@@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
     if iterator or chunksize:
         return reader
 
-    return reader.read()
+    data = reader.read()
+    reader.close()
+    return data
```
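Summary usage sketch for `read_sas` after this change (paths hypothetical): a plain call closes the file before returning, while an iterating call hands the open reader to the caller, who closes it.

```python
import pandas as pd

df = pd.read_sas("data.xpt")                # read all: file closed on return

it = pd.read_sas("data.xpt", chunksize=10)  # iterate: reader stays open
chunk = it.read(10)
it.close()                                  # caller releases the handle
```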