From f5756bed29d766553094ad34e4181b51e70b3c35 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 23:18:33 +0100 Subject: [PATCH 01/16] Generate exception from the C code in the proper manner Get rid of all error printf's and produce proper Python exceptions --- pandas/_libs/parsers.pyx | 12 +----------- pandas/_libs/src/parser/io.c | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4f7d75e0aaad6..eb8e50a66a080 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -241,7 +241,7 @@ cdef extern from "parser/io.h": void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) - void *new_file_source(char *fname, size_t buffer_size) + void *new_file_source(char *fname, size_t buffer_size) except NULL void *new_rd_source(object obj) @@ -667,16 +667,6 @@ cdef class TextReader: ptr = new_file_source(source, self.parser.chunksize) self.parser.cb_io = &buffer_file_bytes self.parser.cb_cleanup = &del_file_source - - if ptr == NULL: - if not os.path.exists(source): - - raise FileNotFoundError( - ENOENT, - f'File {usource} does not exist', - usource) - raise IOError('Initializing from file failed') - self.parser.source = ptr elif hasattr(source, 'read'): diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 1e3295fcb6fc7..e9773dcf14dbc 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software. void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); if (fs == NULL) { + PyErr_NoMemory(); return NULL; } @@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); + PyErr_SetFromWindowsErr(GetLastError()) return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); if (wname == NULL) { free(fs); + PyErr_NoMemory(); return NULL; } if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { free(wname); free(fs); + PyErr_SetFromWindowsErr(GetLastError()) return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); @@ -62,6 +66,11 @@ void *new_file_source(char *fname, size_t buffer_size) { #endif if (fs->fd == -1) { free(fs); +#ifdef USE_WIN_UTF16 + PyErr_SetFromWindowsErr(GetLastError()) +#else + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); +#endif return NULL; } @@ -71,6 +80,7 @@ void *new_file_source(char *fname, size_t buffer_size) { if (fs->buffer == NULL) { close(fs->fd); free(fs); + PyErr_NoMemory(); return NULL; } @@ -83,6 +93,10 @@ void *new_file_source(char *fname, size_t buffer_size) { void *new_rd_source(PyObject *obj) { rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + if (rds == NULL) { + PyErr_NoMemory(); + return NULL; + } /* hold on to this object */ Py_INCREF(obj); rds->obj = obj; @@ -220,20 +234,18 @@ void *new_mmap(char *fname) { mm = (memory_map *)malloc(sizeof(memory_map)); if (mm == NULL) { - fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return (NULL); + PyErr_NoMemory(); + return NULL; } mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { - fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", - fname, errno); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); free(mm); return NULL; } if (fstat(mm->fd, &stat) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", - errno); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); close(mm->fd); free(mm); return NULL; @@ -242,8 +254,7 @@ void *new_mmap(char *fname) { mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); if (mm->memmap == MAP_FAILED) { - /* XXX Eventually remove this print statement. */ - fprintf(stderr, "new_file_buffer: mmap() failed.\n"); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); close(mm->fd); free(mm); return NULL; From b5aa527596d6d216fab65ef771465aee01ec4ea2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 23:24:49 +0100 Subject: [PATCH 02/16] Declare some more exceptions from C code --- pandas/_libs/parsers.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index eb8e50a66a080..cb84c385ac5ee 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -236,14 +236,14 @@ cdef extern from "parser/tokenizer.h": cdef extern from "parser/io.h": - void *new_mmap(char *fname) + void *new_mmap(char *fname) except NULL int del_mmap(void *src) void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) void *new_file_source(char *fname, size_t buffer_size) except NULL - void *new_rd_source(object obj) + void *new_rd_source(object obj) except NULL int del_file_source(void *src) int del_rd_source(void *src) From c9943e040273fa8a1e12234f65aab3d6b68c3a5d Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 22:23:15 +0100 Subject: [PATCH 03/16] Remove special case error message for c parser --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 0f3a5be76ae60..3ba2f4ad882f0 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -960,7 +960,7 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = f"{tm.rands(10)}.csv" - msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" + msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) From dae788656d588ed9c848479681703156780aa9ea Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 22:29:02 +0100 Subject: [PATCH 04/16] Add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d3a9a8f969a4..877e2d9fd35b7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -346,6 +346,7 @@ I/O - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) - Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) +- Bug in :meth:`read_csv` was raising a misleading exception on permission issue (:issue:`23784`) Plotting From 6e5852ece56a9846ad6e85ba6ac4d1966d5b4ec7 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 22:38:48 +0100 Subject: [PATCH 05/16] Fix missing semicolons --- pandas/_libs/src/parser/io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index e9773dcf14dbc..89fbac8582e7e 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -42,7 +42,7 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); - PyErr_SetFromWindowsErr(GetLastError()) + PyErr_SetFromWindowsErr(GetLastError()); return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); @@ -55,7 +55,7 @@ void *new_file_source(char *fname, size_t buffer_size) { required) { free(wname); free(fs); - PyErr_SetFromWindowsErr(GetLastError()) + PyErr_SetFromWindowsErr(GetLastError()); return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); @@ -67,7 +67,7 @@ void *new_file_source(char *fname, size_t buffer_size) { if (fs->fd == -1) { free(fs); #ifdef USE_WIN_UTF16 - PyErr_SetFromWindowsErr(GetLastError()) + PyErr_SetFromWindowsErr(GetLastError()); #else PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); #endif From a1a60ef39a003516d09a9c4adda2b0035cf7c699 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 23:09:12 +0100 Subject: [PATCH 06/16] Add regression test --- pandas/tests/io/parser/test_common.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 3ba2f4ad882f0..e61d401ff0524 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -969,6 +969,18 @@ def test_nonexistent_path(all_parsers): assert path == filename +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D From 15f424b01785dac41b0ae6547d71852e5d348bf4 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 23:15:39 +0100 Subject: [PATCH 07/16] Remove special case handling for Windows PyErr_SetFromErrnoWithFilename works for Unix and Windows --- pandas/_libs/src/parser/io.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 89fbac8582e7e..7ddf25fea90df 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -66,11 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) { #endif if (fs->fd == -1) { free(fs); -#ifdef USE_WIN_UTF16 - PyErr_SetFromWindowsErr(GetLastError()); -#else PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); -#endif return NULL; } From 42c2b535558728bdda431d2c4c0c5b380c29e343 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 23:20:06 +0100 Subject: [PATCH 08/16] Remove call to GetLastError(), when using 0, the python error code handles this --- pandas/_libs/src/parser/io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 7ddf25fea90df..8640da4bcb725 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -42,7 +42,7 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); - PyErr_SetFromWindowsErr(GetLastError()); + PyErr_SetFromWindowsErr(0); return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); @@ -55,7 +55,7 @@ void *new_file_source(char *fname, size_t buffer_size) { required) { free(wname); free(fs); - PyErr_SetFromWindowsErr(GetLastError()); + PyErr_SetFromWindowsErr(0); return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); From 315f1e1872ec38df3b24a65c5d765188f59c414a Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 23:41:16 +0100 Subject: [PATCH 09/16] black fixes --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e61d401ff0524..82f07fa73cad0 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -975,7 +975,7 @@ def test_no_permission(all_parsers): msg = r"\[Errno 13\]" with tm.ensure_clean() as path: - os.chmod(path, 0) # make file unreadable + os.chmod(path, 0) # make file unreadable with pytest.raises(PermissionError, match=msg) as e: parser.read_csv(path) assert path == e.value.filename From f14b501925754d535a95433e7bcba34893d3ffaf Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 20:55:10 +0100 Subject: [PATCH 10/16] Fix indentation of assert statement (also in previous test, same error) --- pandas/tests/io/parser/test_common.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 82f07fa73cad0..c4c709c8de7de 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -963,10 +963,7 @@ def test_nonexistent_path(all_parsers): msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) - - filename = e.value.filename - - assert path == filename + assert path == e.value.filename def test_no_permission(all_parsers): @@ -978,7 +975,7 @@ def test_no_permission(all_parsers): os.chmod(path, 0) # make file unreadable with pytest.raises(PermissionError, match=msg) as e: parser.read_csv(path) - assert path == e.value.filename + assert path == e.value.filename def test_missing_trailing_delimiters(all_parsers): From ea402b56c4020052d673832002e1b30fa610acb2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 21:57:07 +0100 Subject: [PATCH 11/16] Skip the test on windows --- pandas/tests/io/parser/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c4c709c8de7de..35e541a9939a3 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -966,6 +966,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename +@td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 parser = all_parsers From 3571de678a3a28ec0b5f60e8be6ffb98a60e0980 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 22:15:51 +0100 Subject: [PATCH 12/16] Fix black issue --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 35e541a9939a3..9de2ec9799353 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -966,7 +966,7 @@ def test_nonexistent_path(all_parsers): assert path == e.value.filename -@td.skip_if_windows # os.chmod does not work in windows +@td.skip_if_windows # os.chmod does not work in windows def test_no_permission(all_parsers): # GH 23784 parser = all_parsers From 1be646eaecb9e614ebb230d50f737239d861f67b Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 23:14:30 +0100 Subject: [PATCH 13/16] Let new_mmap fail without exception to allow fallback --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cb84c385ac5ee..92e9aaf6fe159 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -236,7 +236,7 @@ cdef extern from "parser/tokenizer.h": cdef extern from "parser/io.h": - void *new_mmap(char *fname) except NULL + void *new_mmap(char *fname) int del_mmap(void *src) void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) From 1e90a6efa3ab5fd4bcfcba0f47a3cb0ffc458ba5 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 23:20:44 +0100 Subject: [PATCH 14/16] Do not create a python error in new_mmap to allow the fallback to work silently --- pandas/_libs/src/parser/io.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 8640da4bcb725..51504527de5a2 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -230,18 +230,15 @@ void *new_mmap(char *fname) { mm = (memory_map *)malloc(sizeof(memory_map)); if (mm == NULL) { - PyErr_NoMemory(); return NULL; } mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); free(mm); return NULL; } if (fstat(mm->fd, &stat) == -1) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); close(mm->fd); free(mm); return NULL; @@ -250,7 +247,6 @@ void *new_mmap(char *fname) { mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); if (mm->memmap == MAP_FAILED) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); close(mm->fd); free(mm); return NULL; From 316e2e7e5f11eb5877ee2e746ed8c58dc097e6a4 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 19 Mar 2020 00:09:25 +0100 Subject: [PATCH 15/16] Remove the NULL pointer check for new_rd_source now that it will raise an exception --- pandas/_libs/parsers.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 92e9aaf6fe159..39195585ebfa6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -673,10 +673,6 @@ cdef class TextReader: # e.g., StringIO ptr = new_rd_source(source) - if ptr == NULL: - raise IOError('Initializing parser from file-like ' - 'object failed') - self.parser.source = ptr self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source From 7371d13b948cc6d0d37c7347694fd3832ef2a2c3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 18 Mar 2020 20:34:45 -0400 Subject: [PATCH 16/16] Update doc/source/whatsnew/v1.1.0.rst Co-Authored-By: gfyoung --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 877e2d9fd35b7..399e275bb371a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -346,7 +346,7 @@ I/O - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) - Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) -- Bug in :meth:`read_csv` was raising a misleading exception on permission issue (:issue:`23784`) +- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) Plotting