Commit 4a80521

agraboso authored and jreback committed
BUG: properly close files opened by parsers
closes #13932

Author: agraboso <[email protected]>

Closes #13940 from agraboso/fix-13932 and squashes the following commits:

3fa7d25 [agraboso] Close open files in TextFileReader upon StopIteration
6592c73 [agraboso] Do not acquire list as file handler to close
7aa5184 [agraboso] Properly close opened files in XportReader and SAS7BDATReader
240383c [agraboso] Properly close opened files in two tests
52d1073 [agraboso] Fix linting error
39dcd99 [agraboso] Fix rebase
75fc34d [agraboso] Make try/except blocks in StataReader.read as small as possible
812e6ec [agraboso] Fix long line
c7e9c9c [agraboso] On close, CParserWrapper must call self._reader.close()
99e16dd [agraboso] Fix whatsnew entries
30b61e6 [agraboso] Properly close opened files in StataWriter
3b0f25f [agraboso] Properly close opened files in StataReader
1e39a5e [agraboso] Properly close opened files in three tests
d759156 [agraboso] BUG: properly close files opened by parsers
1 parent a963139 · commit 4a80521

14 files changed: +121 -48 lines

doc/source/whatsnew/v0.19.0.txt (+3)

@@ -905,6 +905,9 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
+- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
+- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
+
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)

pandas/io/common.py (+3 -1)

@@ -327,7 +327,9 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):

     if memory_map and hasattr(f, 'fileno'):
         try:
-            f = MMapWrapper(f)
+            g = MMapWrapper(f)
+            f.close()
+            f = g
         except Exception:
             # we catch any errors that may have occurred
             # because that is consistent with the lower-level
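The reordering matters: the mapping is created first, and the original handle is closed only if wrapping succeeded. Below is a minimal sketch of that pattern, assuming only mmap's documented behavior that a mapping stays valid after the file object that created it is closed; MMapSketch and data.csv are illustrative, not pandas internals.

    import mmap

    class MMapSketch(object):
        """Illustrative stand-in for pandas' MMapWrapper."""

        def __init__(self, f):
            # mmap keeps its own reference to the underlying file,
            # so the wrapped handle can be closed once this succeeds
            self._mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

        def readline(self):
            return self._mmap.readline()

    f = open('data.csv', 'rb')      # assumes a non-empty data.csv exists
    try:
        g = MMapSketch(f)           # may raise, e.g. for an empty file
    except Exception:
        pass                        # fall back to the unwrapped handle
    else:
        f.close()                   # safe: the mapping remains valid
        f = g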

pandas/io/parsers.py (+30 -8)

@@ -393,11 +393,15 @@ def _read(filepath_or_buffer, kwds):
         raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
                                   " together yet.")
     elif nrows is not None:
-        return parser.read(nrows)
+        data = parser.read(nrows)
+        parser.close()
+        return data
     elif chunksize or iterator:
         return parser

-    return parser.read()
+    data = parser.read()
+    parser.close()
+    return data

 _parser_defaults = {
     'delimiter': None,
@@ -727,10 +731,7 @@ def __init__(self, f, engine=None, **kwds):
         self._make_engine(self.engine)

     def close(self):
-        try:
-            self._engine._reader.close()
-        except:
-            pass
+        self._engine.close()

     def _get_options_with_defaults(self, engine):
         kwds = self.orig_options
@@ -898,7 +899,11 @@ def _clean_options(self, options, engine):
         return result, engine

     def __next__(self):
-        return self.get_chunk()
+        try:
+            return self.get_chunk()
+        except StopIteration:
+            self.close()
+            raise

     def _make_engine(self, engine='c'):
         if engine == 'c':
@@ -1057,8 +1062,13 @@ def __init__(self, kwds):

         self._first_chunk = True

+        # GH 13932
+        # keep references to file handles opened by the parser itself
+        self.handles = []
+
     def close(self):
-        self._reader.close()
+        for f in self.handles:
+            f.close()

     @property
     def _has_complex_date_col(self):
@@ -1356,6 +1366,7 @@ def __init__(self, src, **kwds):
         if 'utf-16' in (kwds.get('encoding') or ''):
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
+                self.handles.append(src)
             src = UTF8Recoder(src, kwds['encoding'])
             kwds['encoding'] = 'utf-8'

@@ -1429,6 +1440,14 @@ def __init__(self, src, **kwds):

         self._implicit_index = self._reader.leading_cols > 0

+    def close(self):
+        for f in self.handles:
+            f.close()
+        try:
+            self._reader.close()
+        except:
+            pass
+
     def _set_noconvert_columns(self):
         names = self.orig_names
         usecols = self.usecols
@@ -1751,13 +1770,16 @@ def __init__(self, f, **kwds):
             f = _get_handle(f, 'r', encoding=self.encoding,
                             compression=self.compression,
                             memory_map=self.memory_map)
+            self.handles.append(f)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
+            self.handles.append(f)
         # in Python 3, convert BytesIO or fileobjects passed with an encoding
         elif compat.PY3 and isinstance(f, compat.BytesIO):
             from io import TextIOWrapper

             f = TextIOWrapper(f, encoding=self.encoding)
+            self.handles.append(f)

         # Set self.data to something that can read lines.
         if hasattr(f, 'readline'):
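Taken together, the parsers.py changes implement one pattern: every handle the parser opens itself goes into self.handles, close() walks that list, and __next__ guarantees the walk happens the moment iteration is exhausted. A minimal self-contained sketch of that pattern (ChunkReader and its fields are hypothetical stand-ins, not the pandas classes):

    class ChunkReader(object):
        def __init__(self, path, chunksize=2):
            # keep references to file handles opened by the reader itself
            self.handles = [open(path, 'r')]
            self.chunksize = chunksize

        def get_chunk(self):
            lines = [self.handles[0].readline() for _ in range(self.chunksize)]
            lines = [line for line in lines if line]
            if not lines:
                raise StopIteration
            return lines

        def close(self):
            for f in self.handles:
                f.close()

        def __iter__(self):
            return self

        def __next__(self):
            try:
                return self.get_chunk()
            except StopIteration:
                self.close()      # release files before the loop ends
                raise

        next = __next__           # Python 2 spelling, as pandas supported then

With this in place, `for chunk in ChunkReader('data.csv'): ...` leaves no open handle behind once the loop finishes, which is the behavior the new TextFileReader.__next__ provides.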

pandas/io/sas/sas7bdat.py (+17)

@@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
         if isinstance(self._path_or_buf, compat.string_types):
             self._path_or_buf = open(self._path_or_buf, 'rb')
+            self.handle = self._path_or_buf

         self._get_properties()
         self._parse_metadata()

+    def close(self):
+        try:
+            self.handle.close()
+        except AttributeError:
+            pass
+
     def _get_properties(self):

         # Check magic number
         self._path_or_buf.seek(0)
         self._cached_page = self._path_or_buf.read(288)
         if self._cached_page[0:len(const.magic)] != const.magic:
+            self.close()
             raise ValueError("magic number mismatch (not a SAS file?)")

         # Get alignment information
@@ -175,6 +183,7 @@ def _get_properties(self):
         buf = self._path_or_buf.read(self.header_length - 288)
         self._cached_page += buf
         if len(self._cached_page) != self.header_length:
+            self.close()
             raise ValueError("The SAS7BDAT file appears to be truncated.")

         self._page_length = self._read_int(const.page_size_offset + align1,
@@ -219,6 +228,7 @@ def _get_properties(self):
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
         if width not in (4, 8):
+            self.close()
             raise ValueError("invalid float width")
         buf = self._read_bytes(offset, width)
         fd = "f" if width == 4 else "d"
@@ -227,6 +237,7 @@ def _read_float(self, offset, width):
     # Read a single signed integer of the given width (1, 2, 4 or 8).
     def _read_int(self, offset, width):
         if width not in (1, 2, 4, 8):
+            self.close()
             raise ValueError("invalid int width")
         buf = self._read_bytes(offset, width)
         it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
@@ -238,11 +249,13 @@ def _read_bytes(self, offset, length):
             self._path_or_buf.seek(offset)
             buf = self._path_or_buf.read(length)
             if len(buf) < length:
+                self.close()
                 msg = "Unable to read {:d} bytes from file position {:d}."
                 raise ValueError(msg.format(length, offset))
             return buf
         else:
             if offset + length > len(self._cached_page):
+                self.close()
                 raise ValueError("The cached page is too small.")
             return self._cached_page[offset:offset + length]

@@ -253,6 +266,7 @@ def _parse_metadata(self):
             if len(self._cached_page) <= 0:
                 break
             if len(self._cached_page) != self._page_length:
+                self.close()
                 raise ValueError(
                     "Failed to read a meta data page from the SAS file.")
             done = self._process_page_meta()
@@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype):
             if (self.compression != "") and f1 and f2:
                 index = const.index.dataSubheaderIndex
             else:
+                self.close()
                 raise ValueError("Unknown subheader signature")
         return index

@@ -598,6 +613,7 @@ def _read_next_page(self):
         if len(self._cached_page) <= 0:
             return True
         elif len(self._cached_page) != self._page_length:
+            self.close()
             msg = ("failed to read complete page from file "
                    "(read {:d} of {:d} bytes)")
             raise ValueError(msg.format(len(self._cached_page),
@@ -643,6 +659,7 @@ def _chunk_to_dataframe(self):
                     rslt.loc[ii, name] = np.nan
                 js += 1
             else:
+                self.close()
                 raise ValueError("unknown column type %s" %
                                  self.column_types[j])
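Two details of the SAS7BDATReader change are worth isolating: self.handle is set only when the reader opened the file itself, and close() swallows AttributeError rather than tracking a separate ownership flag, so a caller-owned buffer is never closed. A hedged sketch of that idiom (FileLikeReader and the b'DEMO' magic bytes are made up for illustration, not pandas code):

    class FileLikeReader(object):
        def __init__(self, path_or_buf):
            if isinstance(path_or_buf, str):
                path_or_buf = open(path_or_buf, 'rb')
                self.handle = path_or_buf   # set only for files we opened
            self._buf = path_or_buf
            self._check_magic()

        def close(self):
            try:
                self.handle.close()
            except AttributeError:
                pass                        # caller-owned buffer: leave it open

        def _check_magic(self):
            if self._buf.read(4) != b'DEMO':    # hypothetical magic number
                self.close()                    # release the file, then fail
                raise ValueError("magic number mismatch")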

pandas/io/sas/sas_xport.py (+9)

@@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',

         self._read_header()

+    def close(self):
+        self.filepath_or_buffer.close()
+
     def _get_row(self):
         return self.filepath_or_buffer.read(80).decode()

@@ -262,13 +265,15 @@ def _read_header(self):
         # read file header
         line1 = self._get_row()
         if line1 != _correct_line1:
+            self.close()
             raise ValueError("Header record is not an XPORT file.")

         line2 = self._get_row()
         fif = [['prefix', 24], ['version', 8], ['OS', 8],
                ['_', 24], ['created', 16]]
         file_info = _split_line(line2, fif)
         if file_info['prefix'] != "SAS SAS SASLIB":
+            self.close()
             raise ValueError("Header record has invalid prefix.")
         file_info['created'] = _parse_date(file_info['created'])
         self.file_info = file_info
@@ -282,6 +287,7 @@ def _read_header(self):
         headflag1 = header1.startswith(_correct_header1)
         headflag2 = (header2 == _correct_header2)
         if not (headflag1 and headflag2):
+            self.close()
             raise ValueError("Member header not found")
         # usually 140, could be 135
         fieldnamelength = int(header1[-5:-2])
@@ -321,6 +327,7 @@ def _read_header(self):
             field['ntype'] = types[field['ntype']]
             fl = field['field_length']
             if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+                self.close()
                 msg = "Floating field width {0} is not between 2 and 8."
                 raise TypeError(msg.format(fl))

@@ -335,6 +342,7 @@ def _read_header(self):

         header = self._get_row()
         if not header == _correct_obs_header:
+            self.close()
             raise ValueError("Observation header not found.")

         self.fields = fields
@@ -425,6 +433,7 @@ def read(self, nrows=None):
         read_lines = min(nrows, self.nobs - self._lines_read)
         read_len = read_lines * self.record_length
         if read_len <= 0:
+            self.close()
             raise StopIteration
         raw = self.filepath_or_buffer.read(read_len)
         data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

pandas/io/sas/sasreader.py (+3 -1)

@@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
     if iterator or chunksize:
         return reader

-    return reader.read()
+    data = reader.read()
+    reader.close()
+    return data
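The net effect at the API boundary, as a hedged usage sketch ('example.sas7bdat' is an assumed local file): a plain read_sas call no longer leaks its handle, while chunked callers still own the reader and can use the close() method this commit adds.

    import pandas as pd

    df = pd.read_sas('example.sas7bdat')   # handle is closed before returning

    reader = pd.read_sas('example.sas7bdat', chunksize=1000)
    chunk = reader.read(1000)              # caller-driven chunked reads...
    reader.close()                         # ...so the caller closes explicitly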

0 commit comments