From b1d27a0fca77702033da258c75586c78b66a3a42 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 12 Jun 2017 11:35:14 +1000 Subject: [PATCH 1/7] Revert "BUG: Revert gh-16039 (#16663)" This reverts commit c550372910435bcfa8ce35d134c0a4ba761fc084. --- doc/source/whatsnew/v0.20.3.txt | 1 - pandas/_libs/src/parser/io.c | 142 ++++++++++++++++++------------- pandas/_libs/src/parser/io.h | 28 ++---- pandas/io/parsers.py | 3 +- pandas/tests/io/parser/common.py | 15 ---- 5 files changed, 91 insertions(+), 98 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index f21230693686e..52f7701724f18 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -54,7 +54,6 @@ Indexing I/O ^^^ -- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`) Plotting diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 4381ef19e991b..dee7d9d9281c4 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,33 +9,40 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" +#include +#include +#include + /* On-disk FILE, uncompressed */ void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); - fs->fp = fopen(fname, "rb"); - - if (fs->fp == NULL) { - free(fs); + if (fs == NULL) { return NULL; } - setbuf(fs->fp, NULL); - fs->initial_file_pos = ftell(fs->fp); + fs->fd = open(fname, O_RDONLY); + if (fs->fd == -1) { + goto err_free; + } // Only allocate this heap memory if we are not memory-mapping the file fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - return NULL; + goto err_free; } - memset(fs->buffer, 0, buffer_size + 1); - fs->buffer[buffer_size] = '\0'; + memset(fs->buffer, '\0', buffer_size + 1); + fs->size = buffer_size; return (void *)fs; + +err_free: + free(fs); + return NULL; } void *new_rd_source(PyObject *obj) { @@ -56,12 +63,12 @@ void *new_rd_source(PyObject *obj) { */ -int del_file_source(void *fs) { +int del_file_source(void *ptr) { + file_source *fs = ptr; if (fs == NULL) return 0; - /* allocated on the heap */ - free(FS(fs)->buffer); - fclose(FS(fs)->fp); + free(fs->buffer); + close(fs->fd); free(fs); return 0; @@ -83,17 +90,31 @@ int del_rd_source(void *rds) { void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { - file_source *src = FS(source); + file_source *fs = FS(source); + ssize_t rv; - *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); + if (nbytes > fs->size) { + nbytes = fs->size; + } - if (*bytes_read == 0) { + rv = read(fs->fd, fs->buffer, nbytes); + switch (rv) { + case -1: + *status = CALLING_READ_FAILED; + *bytes_read = 0; + return NULL; + case 0: *status = REACHED_EOF; - } else { + *bytes_read = 0; + return NULL; + default: *status = 0; + *bytes_read = rv; + fs->buffer[rv] = '\0'; + break; } - return (void *)src->buffer; + return (void *)fs->buffer; } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, @@ -152,52 +173,58 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, #ifdef HAVE_MMAP #include -#include void *new_mmap(char *fname) { - struct stat buf; - int fd; memory_map *mm; - off_t filesize; + struct stat stat; + size_t filesize; mm = (memory_map *)malloc(sizeof(memory_map)); - mm->fp = fopen(fname, "rb"); - - fd = fileno(mm->fp); - if (fstat(fd, &buf) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); - return NULL; - } - filesize = buf.st_size; /* XXX This might be 32 bits. */ - if (mm == NULL) { - /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return NULL; + return (NULL); + } + mm->fd = open(fname, O_RDONLY); + if (mm->fd == -1) { + fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", + fname, errno); + goto err_free; } - mm->size = (off_t)filesize; - mm->line_number = 0; - mm->fileno = fd; - mm->position = ftell(mm->fp); - mm->last_pos = (off_t)filesize; + if (fstat(mm->fd, &stat) == -1) { + fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", + errno); + goto err_close; + } + filesize = stat.st_size; /* XXX This might be 32 bits. */ - mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (mm->memmap == NULL) { + mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); + if (mm->memmap == MAP_FAILED) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: mmap() failed.\n"); - free(mm); - mm = NULL; + goto err_close; } - return (void *)mm; + mm->size = (off_t)filesize; + mm->position = 0; + + return mm; + +err_close: + close(mm->fd); +err_free: + free(mm); + return NULL; } -int del_mmap(void *src) { - munmap(MM(src)->memmap, MM(src)->size); +int del_mmap(void *ptr) { + memory_map *mm = ptr; + + if (mm == NULL) return 0; - fclose(MM(src)->fp); - free(src); + munmap(mm->memmap, mm->size); + close(mm->fd); + free(mm); return 0; } @@ -205,27 +232,26 @@ int del_mmap(void *src) { void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { void *retval; - memory_map *src = MM(source); + memory_map *src = source; + size_t remaining = src->size - src->position; - if (src->position == src->last_pos) { + if (remaining == 0) { *bytes_read = 0; *status = REACHED_EOF; return NULL; } - retval = src->memmap + src->position; - - if (src->position + (off_t)nbytes > src->last_pos) { - // fewer than nbytes remaining - *bytes_read = src->last_pos - src->position; - } else { - *bytes_read = nbytes; + if (nbytes > remaining) { + nbytes = remaining; } - *status = 0; + retval = src->memmap + src->position; /* advance position in mmap data structure */ - src->position += *bytes_read; + src->position += nbytes; + + *bytes_read = nbytes; + *status = 0; return retval; } diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index 77121e9a169c1..d22e8ddaea88d 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software. typedef struct _file_source { /* The file being read. */ - FILE *fp; + int fd; char *buffer; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - /* Offset in the file of the data currently in the buffer. */ - off_t buffer_file_pos; - - /* Actual number of bytes in the current buffer. (Can be less than - * buffer_size.) */ - off_t last_pos; + size_t size; } file_source; #define FS(source) ((file_source *)source) @@ -37,20 +28,13 @@ typedef struct _file_source { #endif typedef struct _memory_map { - FILE *fp; + int fd; /* Size of the file, in bytes. */ - off_t size; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - int line_number; - - int fileno; - off_t position; - off_t last_pos; char *memmap; + size_t size; + + size_t position; } memory_map; #define MM(src) ((memory_map *)src) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9ec3f79e1ae70..c2d5a629b03a3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1985,8 +1985,7 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - mode = 'r' if PY3 else 'rb' - f, handles = _get_handle(f, mode, encoding=self.encoding, + f, handles = _get_handle(f, 'r', encoding=self.encoding, compression=self.compression, memory_map=self.memory_map) self.handles.extend(handles) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 4b4f44b44c163..31d815a4bca97 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1662,21 +1662,6 @@ def test_internal_eof_byte(self): result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) - def test_internal_eof_byte_to_file(self): - # see gh-16559 - data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = pd.DataFrame([["test \x1a test", " test"]], - columns=["c1", "c2"]) - - path = '__%s__.csv' % tm.rands(10) - - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) - - result = self.read_csv(path) - tm.assert_frame_equal(result, expected) - def test_file_handles(self): # GH 14418 - don't close user provided file handles From 3a0c08d5ac9ecbb2c8d263ca8b9b1c1233f51be8 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 12 Jun 2017 11:36:56 +1000 Subject: [PATCH 2/7] always treat files as binary to cope with windows and EOF. on windows, EOF can appear "in band" if the file is considered text. when moving from fread() to read(), i lost the "b" part of the mode. at the time i believed this was a nop, since unix doesnt treat files differently based on that flag. this adds O_BINARY to the flags to open to restore the behaviour lost when taking "b" away from fopen. if a platform doesn't provide O_BINARY, this defines it to 0 so it can still be used without effect later on in the code. --- pandas/_libs/src/parser/io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index dee7d9d9281c4..9bb35a5f9eaa1 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -13,6 +13,10 @@ The full license is in the LICENSE file, distributed with this software. #include #include +#ifndef O_BINARY +#define O_BINARY 0 +#endif /* O_BINARY */ + /* On-disk FILE, uncompressed */ @@ -23,7 +27,7 @@ void *new_file_source(char *fname, size_t buffer_size) { return NULL; } - fs->fd = open(fname, O_RDONLY); + fs->fd = open(fname, O_RDONLY | O_BINARY); if (fs->fd == -1) { goto err_free; } @@ -184,7 +188,7 @@ void *new_mmap(char *fname) { fprintf(stderr, "new_file_buffer: malloc() failed.\n"); return (NULL); } - mm->fd = open(fname, O_RDONLY); + mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", fname, errno); From bad5795eb408d82d59730293efc63bb01c75d01d Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 12 Jun 2017 11:47:43 +1000 Subject: [PATCH 3/7] dont leak the fd in new_file_source() if buffer allocation fails. --- pandas/_libs/src/parser/io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 9bb35a5f9eaa1..3499222bf6ac5 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -36,7 +36,7 @@ void *new_file_source(char *fname, size_t buffer_size) { fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - goto err_free; + goto err_close; } memset(fs->buffer, '\0', buffer_size + 1); @@ -44,6 +44,8 @@ void *new_file_source(char *fname, size_t buffer_size) { return (void *)fs; +err_close: + close(fs->fd); err_free: free(fs); return NULL; From 28543ff4a88e275769f684819e96df8b2b6b226e Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 12 Jun 2017 13:30:05 +1000 Subject: [PATCH 4/7] reapply the test for EOF in the middle of a stream. part of c550372910435bcfa8ce35d134c0a4ba761fc084 --- pandas/tests/io/parser/common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 31d815a4bca97..4b4f44b44c163 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1662,6 +1662,21 @@ def test_internal_eof_byte(self): result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) + def test_internal_eof_byte_to_file(self): + # see gh-16559 + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = pd.DataFrame([["test \x1a test", " test"]], + columns=["c1", "c2"]) + + path = '__%s__.csv' % tm.rands(10) + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = self.read_csv(path) + tm.assert_frame_equal(result, expected) + def test_file_handles(self): # GH 14418 - don't close user provided file handles From 0a831b60dc04dea162865c76d0aece096936ead6 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Mon, 12 Jun 2017 13:33:35 +1000 Subject: [PATCH 5/7] pass rb to _get_handle on python 3, otherwise stick to r. part of c550372910435bcfa8ce35d134c0a4ba761fc084 --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c2d5a629b03a3..9ec3f79e1ae70 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1985,7 +1985,8 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - f, handles = _get_handle(f, 'r', encoding=self.encoding, + mode = 'r' if PY3 else 'rb' + f, handles = _get_handle(f, mode, encoding=self.encoding, compression=self.compression, memory_map=self.memory_map) self.handles.extend(handles) From 48c5a0bf185605a1ebbdc5e08cf66cec2f7a6672 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Tue, 13 Jun 2017 11:49:56 +1000 Subject: [PATCH 6/7] replace goto with inline unwinding of state. requested by @jreback in #16675 feedback. --- pandas/_libs/src/parser/io.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 3499222bf6ac5..8300e889d4157 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -29,26 +29,23 @@ void *new_file_source(char *fname, size_t buffer_size) { fs->fd = open(fname, O_RDONLY | O_BINARY); if (fs->fd == -1) { - goto err_free; + free(fs); + return NULL; } // Only allocate this heap memory if we are not memory-mapping the file fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - goto err_close; + close(fs->fd); + free(fs); + return NULL; } memset(fs->buffer, '\0', buffer_size + 1); fs->size = buffer_size; return (void *)fs; - -err_close: - close(fs->fd); -err_free: - free(fs); - return NULL; } void *new_rd_source(PyObject *obj) { @@ -194,13 +191,16 @@ void *new_mmap(char *fname) { if (mm->fd == -1) { fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", fname, errno); - goto err_free; + free(mm); + return NULL; } if (fstat(mm->fd, &stat) == -1) { fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); - goto err_close; + close(mm->fd); + free(mm); + return NULL; } filesize = stat.st_size; /* XXX This might be 32 bits. */ @@ -208,19 +208,15 @@ void *new_mmap(char *fname) { if (mm->memmap == MAP_FAILED) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: mmap() failed.\n"); - goto err_close; + close(mm->fd); + free(mm); + return NULL; } mm->size = (off_t)filesize; mm->position = 0; return mm; - -err_close: - close(mm->fd); -err_free: - free(mm); - return NULL; } int del_mmap(void *ptr) { From 28ec409da809c3a9c01c4b2b88a30f73ddd315a3 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Tue, 13 Jun 2017 11:53:23 +1000 Subject: [PATCH 7/7] describe the fixes to the read_csv() backend and issue numbers. requested by @jreback in feedback on #16675 --- doc/source/whatsnew/v0.20.3.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index 52f7701724f18..5820e4ab38130 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -54,7 +54,7 @@ Indexing I/O ^^^ - +-- Bug in ``pd.read_csv()`` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue`16675`) Plotting ^^^^^^^^