From 1cd07049dcdb1a353639c5b746b3f1568cad50ca Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 10 Mar 2020 22:42:30 +0100 Subject: [PATCH 01/11] Fix file descriptor leak --- pandas/io/parsers.py | 14 +++++++++----- .../tests/io/parser/test_python_parser_only.py | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 50b5db0274aa5..37ba16bed004d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2273,11 +2273,15 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - ( - self.columns, - self.num_original_columns, - self.unnamed_cols, - ) = self._infer_columns() + try: + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + except EmptyDataError: + self.close() + raise # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 4d933fa02d36f..ba6dd5366281f 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -10,11 +10,13 @@ import pytest -from pandas.errors import ParserError +from pandas.errors import ParserError, EmptyDataError from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm +import psutil + def test_default_separator(python_parser_only): # see gh-17333 @@ -314,3 +316,15 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +def test_file_descriptor_leak(python_parser_only): + # GH 31488 + parser = python_parser_only + with open("empty.csv", "w"): + pass + + proc = psutil.Process() + with pytest.raises(EmptyDataError): + parser.read_csv("empty.csv") + assert not proc.open_files() From 8454c3f464d7454795bc1678209daed0c3c5c9a9 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 10 Mar 2020 22:47:07 +0100 Subject: [PATCH 02/11] Add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4e7bd5a2032a7..31010c98712ad 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -337,6 +337,7 @@ I/O - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) +- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) Plotting From 3bc072c4cf424b5c5abcc597e9e1b22eb4b82637 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 15:10:53 +0100 Subject: [PATCH 03/11] Use ensure_clean --- pandas/tests/io/parser/test_python_parser_only.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index ba6dd5366281f..49d1bf09d5cd9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -320,11 +320,10 @@ def test_malformed_skipfooter(python_parser_only): def test_file_descriptor_leak(python_parser_only): # GH 31488 - parser = python_parser_only - with open("empty.csv", "w"): - pass - proc = psutil.Process() - with pytest.raises(EmptyDataError): - parser.read_csv("empty.csv") - assert not proc.open_files() + parser = python_parser_only + with tm.ensure_clean() as path: + expected = proc.open_files() + with pytest.raises(EmptyDataError): + parser.read_csv(path) + assert proc.open_files() == expected From bf96995e7099181db7c3e8c2475e1a788205d9d2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 15:19:28 +0100 Subject: [PATCH 04/11] Add the expected exception message --- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 49d1bf09d5cd9..9364a4910cfbe 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -324,6 +324,6 @@ def test_file_descriptor_leak(python_parser_only): parser = python_parser_only with tm.ensure_clean() as path: expected = proc.open_files() - with pytest.raises(EmptyDataError): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): parser.read_csv(path) assert proc.open_files() == expected From e23f02e53950e11d8472e4872b9e2f9d404b8f25 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 15:21:49 +0100 Subject: [PATCH 05/11] isort fixes --- pandas/tests/io/parser/test_python_parser_only.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 9364a4910cfbe..9084184dc426e 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -8,15 +8,14 @@ import csv from io import BytesIO, StringIO +import psutil import pytest -from pandas.errors import ParserError, EmptyDataError +from pandas.errors import EmptyDataError, ParserError from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm -import psutil - def test_default_separator(python_parser_only): # see gh-17333 From c284787f173dfae5842da7c7ad51fdbfed960bc6 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 15:28:56 +0100 Subject: [PATCH 06/11] Made the test non-python parser specific --- pandas/tests/io/parser/test_common.py | 12 ++++++++++++ pandas/tests/io/parser/test_python_parser_only.py | 14 +------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b3aa1aa14a509..37ad7335e06ac 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -11,6 +11,7 @@ from urllib.error import URLError import numpy as np +import psutil import pytest from pandas._libs.tslib import Timestamp @@ -2079,3 +2080,14 @@ def test_integer_precision(all_parsers): result = parser.read_csv(StringIO(s), header=None)[4] expected = Series([4321583677327450765, 4321113141090630389], name=4) tm.assert_series_equal(result, expected) + + +def test_file_descriptor_leak(all_parsers): + # GH 31488 + proc = psutil.Process() + parser = all_parsers + with tm.ensure_clean() as path: + expected = proc.open_files() + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + assert proc.open_files() == expected diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 9084184dc426e..4d933fa02d36f 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -8,10 +8,9 @@ import csv from io import BytesIO, StringIO -import psutil import pytest -from pandas.errors import EmptyDataError, ParserError +from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm @@ -315,14 +314,3 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) - - -def test_file_descriptor_leak(python_parser_only): - # GH 31488 - proc = psutil.Process() - parser = python_parser_only - with tm.ensure_clean() as path: - expected = proc.open_files() - with pytest.raises(EmptyDataError, match="No columns to parse from file"): - parser.read_csv(path) - assert proc.open_files() == expected From e7f07df789b1591a7454ba96a67175bf593f2a8d Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 17:19:06 +0100 Subject: [PATCH 07/11] import psutil fails on several travis runs due to missing psutil --- pandas/tests/io/parser/test_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 37ad7335e06ac..fc87158b5abfb 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -11,9 +11,10 @@ from urllib.error import URLError import numpy as np -import psutil import pytest +import pandas.util._test_decorators as td + from pandas._libs.tslib import Timestamp from pandas.errors import DtypeWarning, EmptyDataError, ParserError @@ -2082,8 +2083,10 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) +@td.skip_if_no("psutil") def test_file_descriptor_leak(all_parsers): # GH 31488 + import psutil proc = psutil.Process() parser = all_parsers with tm.ensure_clean() as path: From 2203c716be16af680d2401abcbf89aa9351bff86 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 18:26:26 +0100 Subject: [PATCH 08/11] Fix import order --- pandas/tests/io/parser/test_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index fc87158b5abfb..23b733056af1e 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -13,10 +13,9 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas._libs.tslib import Timestamp from pandas.errors import DtypeWarning, EmptyDataError, ParserError +import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas._testing as tm From 996d474aa787e0cec63086552d3684975a71c5f2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 18:54:20 +0100 Subject: [PATCH 09/11] Fix for black formatter --- pandas/tests/io/parser/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 23b733056af1e..d99d3c033f66a 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2086,6 +2086,7 @@ def test_integer_precision(all_parsers): def test_file_descriptor_leak(all_parsers): # GH 31488 import psutil + proc = psutil.Process() parser = all_parsers with tm.ensure_clean() as path: From 8fa751ecf3ce8bc6acf0d73b9d98c7b6f5925310 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 17:43:19 +0100 Subject: [PATCH 10/11] Add TypeError and ValueError to the list of caught exceptions Also add a comment why combining tm.ensure_clean and td.check_file_leaks don't play along nicely together --- pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_common.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 37ba16bed004d..52783b3a9e134 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2279,7 +2279,7 @@ def __init__(self, f, **kwds): self.num_original_columns, self.unnamed_cols, ) = self._infer_columns() - except EmptyDataError: + except (TypeError, ValueError): self.close() raise diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index d99d3c033f66a..12196aeb1633e 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2085,6 +2085,9 @@ def test_integer_precision(all_parsers): @td.skip_if_no("psutil") def test_file_descriptor_leak(all_parsers): # GH 31488 + # Please note that using tm.ensure_clean does not work in combination + # with td.check_file_leaks as tm.ensure_clean closes the file + # thereby preventing the leak check to trigger import psutil proc = psutil.Process() From d08bc10ca07806d677e061e5309b5cb6093c6fdb Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 23:58:19 +0100 Subject: [PATCH 11/11] Use td.check_file_leaks() in a way that works in combination with tm.ensure_clean --- pandas/tests/io/parser/test_common.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 12196aeb1633e..33460262a4430 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2082,18 +2082,14 @@ def test_integer_precision(all_parsers): tm.assert_series_equal(result, expected) -@td.skip_if_no("psutil") def test_file_descriptor_leak(all_parsers): # GH 31488 - # Please note that using tm.ensure_clean does not work in combination - # with td.check_file_leaks as tm.ensure_clean closes the file - # thereby preventing the leak check to trigger - import psutil - proc = psutil.Process() parser = all_parsers with tm.ensure_clean() as path: - expected = proc.open_files() - with pytest.raises(EmptyDataError, match="No columns to parse from file"): - parser.read_csv(path) - assert proc.open_files() == expected + + def test(): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + td.check_file_leaks(test)()