Skip to content

Commit c550372

Browse files
gfyoungjreback
authored andcommitted
BUG: Revert gh-16039 (#16663)
gh-16039 created a bug in which files containing byte-like data could break, as EOF characters mid-field (despite being quoted) would cause premature line breaks. Given that this PR was a performance patch, this commit can be safely reverted. Closes gh-16559.
1 parent d915c7e commit c550372

File tree

5 files changed

+98
-91
lines changed

5 files changed

+98
-91
lines changed

doc/source/whatsnew/v0.20.3.txt

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Indexing
5454
I/O
5555
^^^
5656

57+
- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`)
5758

5859

5960
Plotting

pandas/_libs/src/parser/io.c

+58-84
Original file line numberDiff line numberDiff line change
@@ -9,40 +9,33 @@ The full license is in the LICENSE file, distributed with this software.
99

1010
#include "io.h"
1111

12-
#include <sys/types.h>
13-
#include <sys/stat.h>
14-
#include <fcntl.h>
15-
1612
/*
1713
On-disk FILE, uncompressed
1814
*/
1915

2016
void *new_file_source(char *fname, size_t buffer_size) {
2117
file_source *fs = (file_source *)malloc(sizeof(file_source));
22-
if (fs == NULL) {
18+
fs->fp = fopen(fname, "rb");
19+
20+
if (fs->fp == NULL) {
21+
free(fs);
2322
return NULL;
2423
}
24+
setbuf(fs->fp, NULL);
2525

26-
fs->fd = open(fname, O_RDONLY);
27-
if (fs->fd == -1) {
28-
goto err_free;
29-
}
26+
fs->initial_file_pos = ftell(fs->fp);
3027

3128
// Only allocate this heap memory if we are not memory-mapping the file
3229
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));
3330

3431
if (fs->buffer == NULL) {
35-
goto err_free;
32+
return NULL;
3633
}
3734

38-
memset(fs->buffer, '\0', buffer_size + 1);
39-
fs->size = buffer_size;
35+
memset(fs->buffer, 0, buffer_size + 1);
36+
fs->buffer[buffer_size] = '\0';
4037

4138
return (void *)fs;
42-
43-
err_free:
44-
free(fs);
45-
return NULL;
4639
}
4740

4841
void *new_rd_source(PyObject *obj) {
@@ -63,12 +56,12 @@ void *new_rd_source(PyObject *obj) {
6356
6457
*/
6558

66-
int del_file_source(void *ptr) {
67-
file_source *fs = ptr;
59+
int del_file_source(void *fs) {
6860
if (fs == NULL) return 0;
6961

70-
free(fs->buffer);
71-
close(fs->fd);
62+
/* allocated on the heap */
63+
free(FS(fs)->buffer);
64+
fclose(FS(fs)->fp);
7265
free(fs);
7366

7467
return 0;
@@ -90,31 +83,17 @@ int del_rd_source(void *rds) {
9083

9184
void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
9285
int *status) {
93-
file_source *fs = FS(source);
94-
ssize_t rv;
86+
file_source *src = FS(source);
9587

96-
if (nbytes > fs->size) {
97-
nbytes = fs->size;
98-
}
88+
*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
9989

100-
rv = read(fs->fd, fs->buffer, nbytes);
101-
switch (rv) {
102-
case -1:
103-
*status = CALLING_READ_FAILED;
104-
*bytes_read = 0;
105-
return NULL;
106-
case 0:
90+
if (*bytes_read == 0) {
10791
*status = REACHED_EOF;
108-
*bytes_read = 0;
109-
return NULL;
110-
default:
92+
} else {
11193
*status = 0;
112-
*bytes_read = rv;
113-
fs->buffer[rv] = '\0';
114-
break;
11594
}
11695

117-
return (void *)fs->buffer;
96+
return (void *)src->buffer;
11897
}
11998

12099
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
@@ -173,86 +152,81 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
173152
#ifdef HAVE_MMAP
174153

175154
#include <sys/mman.h>
155+
#include <sys/stat.h>
176156

177157
void *new_mmap(char *fname) {
158+
struct stat buf;
159+
int fd;
178160
memory_map *mm;
179-
struct stat stat;
180-
size_t filesize;
161+
off_t filesize;
181162

182163
mm = (memory_map *)malloc(sizeof(memory_map));
164+
mm->fp = fopen(fname, "rb");
165+
166+
fd = fileno(mm->fp);
167+
if (fstat(fd, &buf) == -1) {
168+
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
169+
return NULL;
170+
}
171+
filesize = buf.st_size; /* XXX This might be 32 bits. */
172+
183173
if (mm == NULL) {
174+
/* XXX Eventually remove this print statement. */
184175
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
185-
return (NULL);
186-
}
187-
mm->fd = open(fname, O_RDONLY);
188-
if (mm->fd == -1) {
189-
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
190-
fname, errno);
191-
goto err_free;
176+
return NULL;
192177
}
178+
mm->size = (off_t)filesize;
179+
mm->line_number = 0;
193180

194-
if (fstat(mm->fd, &stat) == -1) {
195-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
196-
errno);
197-
goto err_close;
198-
}
199-
filesize = stat.st_size; /* XXX This might be 32 bits. */
181+
mm->fileno = fd;
182+
mm->position = ftell(mm->fp);
183+
mm->last_pos = (off_t)filesize;
200184

201-
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
202-
if (mm->memmap == MAP_FAILED) {
185+
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
186+
if (mm->memmap == NULL) {
203187
/* XXX Eventually remove this print statement. */
204188
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
205-
goto err_close;
189+
free(mm);
190+
mm = NULL;
206191
}
207192

208-
mm->size = (off_t)filesize;
209-
mm->position = 0;
210-
211-
return mm;
212-
213-
err_close:
214-
close(mm->fd);
215-
err_free:
216-
free(mm);
217-
return NULL;
193+
return (void *)mm;
218194
}
219195

220-
int del_mmap(void *ptr) {
221-
memory_map *mm = ptr;
222-
223-
if (mm == NULL) return 0;
196+
int del_mmap(void *src) {
197+
munmap(MM(src)->memmap, MM(src)->size);
224198

225-
munmap(mm->memmap, mm->size);
226-
close(mm->fd);
227-
free(mm);
199+
fclose(MM(src)->fp);
200+
free(src);
228201

229202
return 0;
230203
}
231204

232205
void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
233206
int *status) {
234207
void *retval;
235-
memory_map *src = source;
236-
size_t remaining = src->size - src->position;
208+
memory_map *src = MM(source);
237209

238-
if (remaining == 0) {
210+
if (src->position == src->last_pos) {
239211
*bytes_read = 0;
240212
*status = REACHED_EOF;
241213
return NULL;
242214
}
243215

244-
if (nbytes > remaining) {
245-
nbytes = remaining;
246-
}
247-
248216
retval = src->memmap + src->position;
249217

250-
/* advance position in mmap data structure */
251-
src->position += nbytes;
218+
if (src->position + (off_t)nbytes > src->last_pos) {
219+
// fewer than nbytes remaining
220+
*bytes_read = src->last_pos - src->position;
221+
} else {
222+
*bytes_read = nbytes;
223+
}
252224

253-
*bytes_read = nbytes;
254225
*status = 0;
255226

227+
/* advance position in mmap data structure */
228+
src->position += *bytes_read;
229+
256230
return retval;
257231
}
258232

pandas/_libs/src/parser/io.h

+22-6
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,19 @@ The full license is in the LICENSE file, distributed with this software.
1515

1616
typedef struct _file_source {
1717
/* The file being read. */
18-
int fd;
18+
FILE *fp;
1919

2020
char *buffer;
21-
size_t size;
21+
22+
/* file position when the file_buffer was created. */
23+
off_t initial_file_pos;
24+
25+
/* Offset in the file of the data currently in the buffer. */
26+
off_t buffer_file_pos;
27+
28+
/* Actual number of bytes in the current buffer. (Can be less than
29+
* buffer_size.) */
30+
off_t last_pos;
2231
} file_source;
2332

2433
#define FS(source) ((file_source *)source)
@@ -28,13 +37,20 @@ typedef struct _file_source {
2837
#endif
2938

3039
typedef struct _memory_map {
31-
int fd;
40+
FILE *fp;
3241

3342
/* Size of the file, in bytes. */
34-
char *memmap;
35-
size_t size;
43+
off_t size;
3644

37-
size_t position;
45+
/* file position when the file_buffer was created. */
46+
off_t initial_file_pos;
47+
48+
int line_number;
49+
50+
int fileno;
51+
off_t position;
52+
off_t last_pos;
53+
char *memmap;
3854
} memory_map;
3955

4056
#define MM(src) ((memory_map *)src)

pandas/io/parsers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1985,7 +1985,8 @@ def __init__(self, f, **kwds):
19851985
self.comment = kwds['comment']
19861986
self._comment_lines = []
19871987

1988-
f, handles = _get_handle(f, 'r', encoding=self.encoding,
1988+
mode = 'r' if PY3 else 'rb'
1989+
f, handles = _get_handle(f, mode, encoding=self.encoding,
19891990
compression=self.compression,
19901991
memory_map=self.memory_map)
19911992
self.handles.extend(handles)

pandas/tests/io/parser/common.py

+15
Original file line numberDiff line numberDiff line change
@@ -1662,6 +1662,21 @@ def test_internal_eof_byte(self):
16621662
result = self.read_csv(StringIO(data))
16631663
tm.assert_frame_equal(result, expected)
16641664

1665+
def test_internal_eof_byte_to_file(self):
1666+
# see gh-16559
1667+
data = b'c1,c2\r\n"test \x1a test", test\r\n'
1668+
expected = pd.DataFrame([["test \x1a test", " test"]],
1669+
columns=["c1", "c2"])
1670+
1671+
path = '__%s__.csv' % tm.rands(10)
1672+
1673+
with tm.ensure_clean(path) as path:
1674+
with open(path, "wb") as f:
1675+
f.write(data)
1676+
1677+
result = self.read_csv(path)
1678+
tm.assert_frame_equal(result, expected)
1679+
16651680
def test_file_handles(self):
16661681
# GH 14418 - don't close user provided file handles
16671682

0 commit comments

Comments
 (0)