Skip to content

BUG: csv parsing with EOF byte on windows, Revert gh-16039 #16663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Indexing
I/O
^^^

- Bug in ``pd.read_csv()`` in which files containing EOF characters mid-field could fail with the C engine on Windows (:issue:`16039`, :issue:`16559`)


Plotting
Expand Down
142 changes: 58 additions & 84 deletions pandas/_libs/src/parser/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,40 +9,33 @@ The full license is in the LICENSE file, distributed with this software.

#include "io.h"

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/*
On-disk FILE, uncompressed
*/

void *new_file_source(char *fname, size_t buffer_size) {
file_source *fs = (file_source *)malloc(sizeof(file_source));
if (fs == NULL) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fs can still be a NULL pointer no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it can be on some architectures. We should protect against that.

fs->fp = fopen(fname, "rb");

if (fs->fp == NULL) {
free(fs);
return NULL;
}
setbuf(fs->fp, NULL);

fs->fd = open(fname, O_RDONLY);
if (fs->fd == -1) {
goto err_free;
}
fs->initial_file_pos = ftell(fs->fp);

// Only allocate this heap memory if we are not memory-mapping the file
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));

if (fs->buffer == NULL) {
goto err_free;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't you need to free fs here? (what about closing the fh)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, those would both be good to do here, if possible.

return NULL;
}

memset(fs->buffer, '\0', buffer_size + 1);
fs->size = buffer_size;
memset(fs->buffer, 0, buffer_size + 1);
fs->buffer[buffer_size] = '\0';

return (void *)fs;

err_free:
free(fs);
return NULL;
}

void *new_rd_source(PyObject *obj) {
Expand All @@ -63,12 +56,12 @@ void *new_rd_source(PyObject *obj) {

*/

int del_file_source(void *ptr) {
file_source *fs = ptr;
int del_file_source(void *fs) {
if (fs == NULL) return 0;

free(fs->buffer);
close(fs->fd);
/* allocated on the heap */
free(FS(fs)->buffer);
fclose(FS(fs)->fp);
free(fs);

return 0;
Expand All @@ -90,31 +83,17 @@ int del_rd_source(void *rds) {

void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status) {
file_source *fs = FS(source);
ssize_t rv;
file_source *src = FS(source);

if (nbytes > fs->size) {
nbytes = fs->size;
}
*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this only read min(bytes left, nbytes)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that is correct.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, fread() will put up to sizeof(char) * nbytes into the buffer at src->buffer. if nbytes is greater than fs->size, then this will overflow. the rest of pandas may limit nbytes to fs->size before passing it to buffer_file_bytes(), but id argue that makes reading or auditing this code harder.

Copy link
Member Author

@gfyoung gfyoung Jun 12, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure we'll hit end-of-file first, and it will terminate then, no?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the file or the remainder of the file is shorter than fs->size, yes. that isn't the only situation buffer_file_bytes() is called in though.

say you have a 1MB file, and new_file_source() is called with 1000 as the buffer_size argument, then fs->size will be 1000 bytes long. if buffer_file_bytes() is called with nbytes as 2000, then you will read 2000 bytes into a 1000 byte long buffer because there at least that much data available in the file.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, got it. I'll add that to my follow-up PR then.


rv = read(fs->fd, fs->buffer, nbytes);
switch (rv) {
case -1:
*status = CALLING_READ_FAILED;
*bytes_read = 0;
return NULL;
case 0:
if (*bytes_read == 0) {
*status = REACHED_EOF;
*bytes_read = 0;
return NULL;
default:
} else {
*status = 0;
*bytes_read = rv;
fs->buffer[rv] = '\0';
break;
}

return (void *)fs->buffer;
return (void *)src->buffer;
}

void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
Expand Down Expand Up @@ -173,86 +152,81 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
#ifdef HAVE_MMAP

#include <sys/mman.h>
#include <sys/stat.h>

void *new_mmap(char *fname) {
struct stat buf;
int fd;
memory_map *mm;
struct stat stat;
size_t filesize;
off_t filesize;

mm = (memory_map *)malloc(sizeof(memory_map));
mm->fp = fopen(fname, "rb");

fd = fileno(mm->fp);
if (fstat(fd, &buf) == -1) {
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
return NULL;
}
filesize = buf.st_size; /* XXX This might be 32 bits. */

if (mm == NULL) {
/* XXX Eventually remove this print statement. */
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
return (NULL);
}
mm->fd = open(fname, O_RDONLY);
if (mm->fd == -1) {
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
fname, errno);
goto err_free;
return NULL;
}
mm->size = (off_t)filesize;
mm->line_number = 0;

if (fstat(mm->fd, &stat) == -1) {
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
errno);
goto err_close;
}
filesize = stat.st_size; /* XXX This might be 32 bits. */
mm->fileno = fd;
mm->position = ftell(mm->fp);
mm->last_pos = (off_t)filesize;

mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
if (mm->memmap == MAP_FAILED) {
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
if (mm->memmap == NULL) {
/* XXX Eventually remove this print statement. */
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
goto err_close;
free(mm);
mm = NULL;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't just return NULL here?

Copy link
Member Author

@gfyoung gfyoung Jun 12, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you can just return NULL here IINM.

}

mm->size = (off_t)filesize;
mm->position = 0;

return mm;

err_close:
close(mm->fd);
err_free:
free(mm);
return NULL;
return (void *)mm;
}

int del_mmap(void *ptr) {
memory_map *mm = ptr;

if (mm == NULL) return 0;
int del_mmap(void *src) {
munmap(MM(src)->memmap, MM(src)->size);

munmap(mm->memmap, mm->size);
close(mm->fd);
free(mm);
fclose(MM(src)->fp);
free(src);

return 0;
}

void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status) {
void *retval;
memory_map *src = source;
size_t remaining = src->size - src->position;
memory_map *src = MM(source);

if (remaining == 0) {
if (src->position == src->last_pos) {
*bytes_read = 0;
*status = REACHED_EOF;
return NULL;
}

if (nbytes > remaining) {
nbytes = remaining;
}

retval = src->memmap + src->position;

/* advance position in mmap data structure */
src->position += nbytes;
if (src->position + (off_t)nbytes > src->last_pos) {
// fewer than nbytes remaining
*bytes_read = src->last_pos - src->position;
} else {
*bytes_read = nbytes;
}

*bytes_read = nbytes;
*status = 0;

/* advance position in mmap data structure */
src->position += *bytes_read;

return retval;
}

Expand Down
28 changes: 22 additions & 6 deletions pandas/_libs/src/parser/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,19 @@ The full license is in the LICENSE file, distributed with this software.

typedef struct _file_source {
/* The file being read. */
int fd;
FILE *fp;

char *buffer;
size_t size;

/* file position when the file_buffer was created. */
off_t initial_file_pos;

/* Offset in the file of the data currently in the buffer. */
off_t buffer_file_pos;

/* Actual number of bytes in the current buffer. (Can be less than
* buffer_size.) */
off_t last_pos;
} file_source;

#define FS(source) ((file_source *)source)
Expand All @@ -28,13 +37,20 @@ typedef struct _file_source {
#endif

typedef struct _memory_map {
int fd;
FILE *fp;

/* Size of the file, in bytes. */
char *memmap;
size_t size;
off_t size;

size_t position;
/* file position when the file_buffer was created. */
off_t initial_file_pos;

int line_number;

int fileno;
off_t position;
off_t last_pos;
char *memmap;
} memory_map;

#define MM(src) ((memory_map *)src)
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1985,7 +1985,8 @@ def __init__(self, f, **kwds):
self.comment = kwds['comment']
self._comment_lines = []

f, handles = _get_handle(f, 'r', encoding=self.encoding,
mode = 'r' if PY3 else 'rb'
f, handles = _get_handle(f, mode, encoding=self.encoding,
compression=self.compression,
memory_map=self.memory_map)
self.handles.extend(handles)
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1662,6 +1662,21 @@ def test_internal_eof_byte(self):
result = self.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)

def test_internal_eof_byte_to_file(self):
# see gh-16559
data = b'c1,c2\r\n"test \x1a test", test\r\n'
expected = pd.DataFrame([["test \x1a test", " test"]],
columns=["c1", "c2"])

path = '__%s__.csv' % tm.rands(10)

with tm.ensure_clean(path) as path:
with open(path, "wb") as f:
f.write(data)

result = self.read_csv(path)
tm.assert_frame_equal(result, expected)

def test_file_handles(self):
# GH 14418 - don't close user provided file handles

Expand Down