Skip to content

Commit ff00b55

Browse files
dgwynne authored and jreback committed
move io.c from using unbuffered fread()s to read()s. (#16039)
* PERF: move io.c from using unbuffered fread()s to read()s. pandas already buffers reads coming from io.c itself, so it previously used setbuf() to disable buffering inside fread(). however, certain implementations of unbuffered stdio reads are sub-optimal. for example, fread() in solaris ends up doing a read() for each individual byte of the underlying file descriptor, which turns out to be very slow. instead, this code now open()s a file descriptor and read()s directly into the buffer that pandas has already allocated. this is effectively what other libcs (eg, glibc) do underneath an unbuffered fread() anyway, but this is more explicit. while here, this tweaks the mmap backend to use open() too, and also properly checks for mmap failure by comparing its result to MAP_FAILED instead of NULL. closes #16039 * DOC: add whatsnew
1 parent 24b4bc6 commit ff00b55

File tree

3 files changed

+91
-80
lines changed

3 files changed

+91
-80
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1537,6 +1537,7 @@ Performance Improvements
15371537
function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
15381538
- Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`).
15391539
- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)
1540+
- Improved performance in ``pd.read_csv()`` on some platforms with buffered reads (:issue:`16039`)
15401541

15411542
.. _whatsnew_0200.bug_fixes:
15421543

pandas/_libs/src/parser/io.c

+84-58
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,40 @@ The full license is in the LICENSE file, distributed with this software.
99

1010
#include "io.h"
1111

12+
#include <sys/types.h>
13+
#include <sys/stat.h>
14+
#include <fcntl.h>
15+
1216
/*
1317
On-disk FILE, uncompressed
1418
*/
1519

1620
void *new_file_source(char *fname, size_t buffer_size) {
1721
file_source *fs = (file_source *)malloc(sizeof(file_source));
18-
fs->fp = fopen(fname, "rb");
19-
20-
if (fs->fp == NULL) {
21-
free(fs);
22+
if (fs == NULL) {
2223
return NULL;
2324
}
24-
setbuf(fs->fp, NULL);
2525

26-
fs->initial_file_pos = ftell(fs->fp);
26+
fs->fd = open(fname, O_RDONLY);
27+
if (fs->fd == -1) {
28+
goto err_free;
29+
}
2730

2831
// Only allocate this heap memory if we are not memory-mapping the file
2932
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));
3033

3134
if (fs->buffer == NULL) {
32-
return NULL;
35+
goto err_free;
3336
}
3437

35-
memset(fs->buffer, 0, buffer_size + 1);
36-
fs->buffer[buffer_size] = '\0';
38+
memset(fs->buffer, '\0', buffer_size + 1);
39+
fs->size = buffer_size;
3740

3841
return (void *)fs;
42+
43+
err_free:
44+
free(fs);
45+
return NULL;
3946
}
4047

4148
void *new_rd_source(PyObject *obj) {
@@ -56,12 +63,12 @@ void *new_rd_source(PyObject *obj) {
5663
5764
*/
5865

59-
int del_file_source(void *fs) {
66+
int del_file_source(void *ptr) {
67+
file_source *fs = ptr;
6068
if (fs == NULL) return 0;
6169

62-
/* allocated on the heap */
63-
free(FS(fs)->buffer);
64-
fclose(FS(fs)->fp);
70+
free(fs->buffer);
71+
close(fs->fd);
6572
free(fs);
6673

6774
return 0;
@@ -83,17 +90,31 @@ int del_rd_source(void *rds) {
8390

8491
void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
8592
int *status) {
86-
file_source *src = FS(source);
93+
file_source *fs = FS(source);
94+
ssize_t rv;
8795

88-
*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
96+
if (nbytes > fs->size) {
97+
nbytes = fs->size;
98+
}
8999

90-
if (*bytes_read == 0) {
100+
rv = read(fs->fd, fs->buffer, nbytes);
101+
switch (rv) {
102+
case -1:
103+
*status = CALLING_READ_FAILED;
104+
*bytes_read = 0;
105+
return NULL;
106+
case 0:
91107
*status = REACHED_EOF;
92-
} else {
108+
*bytes_read = 0;
109+
return NULL;
110+
default:
93111
*status = 0;
112+
*bytes_read = rv;
113+
fs->buffer[rv] = '\0';
114+
break;
94115
}
95116

96-
return (void *)src->buffer;
117+
return (void *)fs->buffer;
97118
}
98119

99120
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
@@ -152,80 +173,85 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
152173
#ifdef HAVE_MMAP
153174

154175
#include <sys/mman.h>
155-
#include <sys/stat.h>
156176

157177
void *new_mmap(char *fname) {
158-
struct stat buf;
159-
int fd;
160178
memory_map *mm;
161-
off_t filesize;
179+
struct stat stat;
180+
size_t filesize;
162181

163182
mm = (memory_map *)malloc(sizeof(memory_map));
164-
mm->fp = fopen(fname, "rb");
165-
166-
fd = fileno(mm->fp);
167-
if (fstat(fd, &buf) == -1) {
168-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
169-
return NULL;
170-
}
171-
filesize = buf.st_size; /* XXX This might be 32 bits. */
172-
173183
if (mm == NULL) {
174-
/* XXX Eventually remove this print statement. */
175184
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
176-
return NULL;
185+
return (NULL);
186+
}
187+
mm->fd = open(fname, O_RDONLY);
188+
if (mm->fd == -1) {
189+
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
190+
fname, errno);
191+
goto err_free;
177192
}
178-
mm->size = (off_t)filesize;
179-
mm->line_number = 0;
180193

181-
mm->fileno = fd;
182-
mm->position = ftell(mm->fp);
183-
mm->last_pos = (off_t)filesize;
194+
if (fstat(mm->fd, &stat) == -1) {
195+
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
196+
errno);
197+
goto err_close;
198+
}
199+
filesize = stat.st_size; /* XXX This might be 32 bits. */
184200

185-
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
186-
if (mm->memmap == NULL) {
201+
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
202+
if (mm->memmap == MAP_FAILED) {
187203
/* XXX Eventually remove this print statement. */
188204
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
189-
free(mm);
190-
mm = NULL;
205+
goto err_close;
191206
}
192207

193-
return (void *)mm;
208+
mm->size = (off_t)filesize;
209+
mm->position = 0;
210+
211+
return mm;
212+
213+
err_close:
214+
close(mm->fd);
215+
err_free:
216+
free(mm);
217+
return NULL;
194218
}
195219

196-
int del_mmap(void *src) {
197-
munmap(MM(src)->memmap, MM(src)->size);
220+
int del_mmap(void *ptr) {
221+
memory_map *mm = ptr;
222+
223+
if (mm == NULL) return 0;
198224

199-
fclose(MM(src)->fp);
200-
free(src);
225+
munmap(mm->memmap, mm->size);
226+
close(mm->fd);
227+
free(mm);
201228

202229
return 0;
203230
}
204231

205232
void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
206233
int *status) {
207234
void *retval;
208-
memory_map *src = MM(source);
235+
memory_map *src = source;
236+
size_t remaining = src->size - src->position;
209237

210-
if (src->position == src->last_pos) {
238+
if (remaining == 0) {
211239
*bytes_read = 0;
212240
*status = REACHED_EOF;
213241
return NULL;
214242
}
215243

216-
retval = src->memmap + src->position;
217-
218-
if (src->position + (off_t)nbytes > src->last_pos) {
219-
// fewer than nbytes remaining
220-
*bytes_read = src->last_pos - src->position;
221-
} else {
222-
*bytes_read = nbytes;
244+
if (nbytes > remaining) {
245+
nbytes = remaining;
223246
}
224247

225-
*status = 0;
248+
retval = src->memmap + src->position;
226249

227250
/* advance position in mmap data structure */
228-
src->position += *bytes_read;
251+
src->position += nbytes;
252+
253+
*bytes_read = nbytes;
254+
*status = 0;
229255

230256
return retval;
231257
}

pandas/_libs/src/parser/io.h

+6-22
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software.
1515

1616
/* State for reading an uncompressed on-disk file through a descriptor. */
typedef struct _file_source {
    int fd;       /* Descriptor of the file being read. */
    char *buffer; /* Heap buffer that read data is placed into. */
    size_t size;  /* Data capacity of `buffer`, in bytes. */
} file_source;
3223

3324
#define FS(source) ((file_source *)source)
@@ -37,20 +28,13 @@ typedef struct _file_source {
3728
#endif
3829

3930
/* State for reading a file through a read-only memory mapping. */
typedef struct _memory_map {
    int fd;          /* Descriptor of the mapped file. */
    char *memmap;    /* Start of the mapped file contents. */
    size_t size;     /* Size of the file, in bytes. */
    size_t position; /* Offset of the next byte to hand out. */
} memory_map;
5539

5640
#define MM(src) ((memory_map *)src)

0 commit comments

Comments
 (0)