From f8e53ad17284f147f72c0965021e0f08bcdf8cac Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Tue, 18 Apr 2017 11:20:01 +1000 Subject: [PATCH 1/2] PERF: move io.c from using unbuffered fread()s to read()s. pandas already buffers reads coming from io.c itself, so it previously used setbuf() to disable buffering inside fread(). however, certain implementations of unbuffered stdio reads are sub-optimal. for example, fread() in solaris ends up doing a read() for each individual byte of the underlying filedescriptor, which turns out to be very slow. instead, this code now open()s a file descritor and read()s directly into the buffer that pandas has already allocated. this is effectively what other libcs (eg, glibc) do underneath an unbuffered fread() anyway, but this is more explicit. while here, this tweaks the mmap backend to use open() too, and also properly checks for mmap failure by comparing its result to MAP_FAILED instead of NULL. closes #16039 --- pandas/_libs/src/parser/io.c | 142 +++++++++++++++++++++-------------- pandas/_libs/src/parser/io.h | 28 ++----- 2 files changed, 90 insertions(+), 80 deletions(-) diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 4381ef19e991b..dee7d9d9281c4 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,33 +9,40 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" +#include +#include +#include + /* On-disk FILE, uncompressed */ void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); - fs->fp = fopen(fname, "rb"); - - if (fs->fp == NULL) { - free(fs); + if (fs == NULL) { return NULL; } - setbuf(fs->fp, NULL); - fs->initial_file_pos = ftell(fs->fp); + fs->fd = open(fname, O_RDONLY); + if (fs->fd == -1) { + goto err_free; + } // Only allocate this heap memory if we are not memory-mapping the file fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char)); if (fs->buffer == NULL) { - return NULL; + goto err_free; } - memset(fs->buffer, 0, buffer_size + 1); - fs->buffer[buffer_size] = '\0'; + memset(fs->buffer, '\0', buffer_size + 1); + fs->size = buffer_size; return (void *)fs; + +err_free: + free(fs); + return NULL; } void *new_rd_source(PyObject *obj) { @@ -56,12 +63,12 @@ void *new_rd_source(PyObject *obj) { */ -int del_file_source(void *fs) { +int del_file_source(void *ptr) { + file_source *fs = ptr; if (fs == NULL) return 0; - /* allocated on the heap */ - free(FS(fs)->buffer); - fclose(FS(fs)->fp); + free(fs->buffer); + close(fs->fd); free(fs); return 0; @@ -83,17 +90,31 @@ int del_rd_source(void *rds) { void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { - file_source *src = FS(source); + file_source *fs = FS(source); + ssize_t rv; - *bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp); + if (nbytes > fs->size) { + nbytes = fs->size; + } - if (*bytes_read == 0) { + rv = read(fs->fd, fs->buffer, nbytes); + switch (rv) { + case -1: + *status = CALLING_READ_FAILED; + *bytes_read = 0; + return NULL; + case 0: *status = REACHED_EOF; - } else { + *bytes_read = 0; + return NULL; + default: *status = 0; + *bytes_read = rv; + fs->buffer[rv] = '\0'; + break; } - return (void *)src->buffer; + return (void *)fs->buffer; } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, @@ -152,52 +173,58 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, #ifdef HAVE_MMAP #include -#include void *new_mmap(char *fname) { - struct stat buf; - int fd; memory_map *mm; - off_t filesize; + struct stat stat; + size_t filesize; mm = (memory_map *)malloc(sizeof(memory_map)); - mm->fp = fopen(fname, "rb"); - - fd = fileno(mm->fp); - if (fstat(fd, &buf) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); - return NULL; - } - filesize = buf.st_size; /* XXX This might be 32 bits. */ - if (mm == NULL) { - /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return NULL; + return (NULL); + } + mm->fd = open(fname, O_RDONLY); + if (mm->fd == -1) { + fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", + fname, errno); + goto err_free; } - mm->size = (off_t)filesize; - mm->line_number = 0; - mm->fileno = fd; - mm->position = ftell(mm->fp); - mm->last_pos = (off_t)filesize; + if (fstat(mm->fd, &stat) == -1) { + fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", + errno); + goto err_close; + } + filesize = stat.st_size; /* XXX This might be 32 bits. */ - mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (mm->memmap == NULL) { + mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); + if (mm->memmap == MAP_FAILED) { /* XXX Eventually remove this print statement. */ fprintf(stderr, "new_file_buffer: mmap() failed.\n"); - free(mm); - mm = NULL; + goto err_close; } - return (void *)mm; + mm->size = (off_t)filesize; + mm->position = 0; + + return mm; + +err_close: + close(mm->fd); +err_free: + free(mm); + return NULL; } -int del_mmap(void *src) { - munmap(MM(src)->memmap, MM(src)->size); +int del_mmap(void *ptr) { + memory_map *mm = ptr; + + if (mm == NULL) return 0; - fclose(MM(src)->fp); - free(src); + munmap(mm->memmap, mm->size); + close(mm->fd); + free(mm); return 0; } @@ -205,27 +232,26 @@ int del_mmap(void *src) { void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) { void *retval; - memory_map *src = MM(source); + memory_map *src = source; + size_t remaining = src->size - src->position; - if (src->position == src->last_pos) { + if (remaining == 0) { *bytes_read = 0; *status = REACHED_EOF; return NULL; } - retval = src->memmap + src->position; - - if (src->position + (off_t)nbytes > src->last_pos) { - // fewer than nbytes remaining - *bytes_read = src->last_pos - src->position; - } else { - *bytes_read = nbytes; + if (nbytes > remaining) { + nbytes = remaining; } - *status = 0; + retval = src->memmap + src->position; /* advance position in mmap data structure */ - src->position += *bytes_read; + src->position += nbytes; + + *bytes_read = nbytes; + *status = 0; return retval; } diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index 77121e9a169c1..d22e8ddaea88d 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software. typedef struct _file_source { /* The file being read. */ - FILE *fp; + int fd; char *buffer; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - /* Offset in the file of the data currently in the buffer. */ - off_t buffer_file_pos; - - /* Actual number of bytes in the current buffer. (Can be less than - * buffer_size.) */ - off_t last_pos; + size_t size; } file_source; #define FS(source) ((file_source *)source) @@ -37,20 +28,13 @@ typedef struct _file_source { #endif typedef struct _memory_map { - FILE *fp; + int fd; /* Size of the file, in bytes. */ - off_t size; - - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - - int line_number; - - int fileno; - off_t position; - off_t last_pos; char *memmap; + size_t size; + + size_t position; } memory_map; #define MM(src) ((memory_map *)src) From b29b032b344c782a0efb968790025c14df25769e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 20 Apr 2017 06:23:38 -0400 Subject: [PATCH 2/2] DOC: add whatsnew --- doc/source/whatsnew/v0.20.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6110454be7da8..e43dbf85db9dc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1537,6 +1537,7 @@ Performance Improvements function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). - Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) +- Improved performance in ``pd.read_csv()`` on some platforms with buffered reads (:issue:`16039`) .. _whatsnew_0200.bug_fixes: