From e56bbf48fef0c0702f3d9c55be39619c57deac73 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 27 Aug 2014 10:50:26 -0400 Subject: [PATCH 1/2] Made line comments work with whitespace delim and custom line terminator --- pandas/io/tests/test_parsers.py | 8 ++++++++ pandas/src/parser/tokenizer.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index fd1febc37caac..14e69179f9ff4 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2944,6 +2944,14 @@ def test_line_comment(self): [5., np.nan, 10.]] df = self.read_csv(StringIO(data), comment='#') tm.assert_almost_equal(df.values, expected) + # check with delim_whitespace=True + df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', + delim_whitespace=True) + tm.assert_almost_equal(df.values, expected) + # check with custom line terminator + df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', + lineterminator='*') + tm.assert_almost_equal(df.values, expected) def test_comment_skiprows(self): data = """# empty diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1e9576487b9ed..b30706f85894b 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -969,6 +969,10 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) END_LINE(); break; } + else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; + } /* normal character - handle as START_FIELD */ self->state = START_FIELD; /* fallthru */ @@ -1103,6 +1107,13 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) } break; + case EAT_LINE_COMMENT: + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; + } + break; + case EAT_COMMENT: if (c == self->lineterminator) { END_LINE(); @@ -1186,6 +1197,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) } else if (IS_WHITESPACE(c)) { self->state = EAT_WHITESPACE; break; + } else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; } else { /* normal character - handle as START_FIELD */ self->state = START_FIELD; @@ -1231,6 +1245,16 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) } break; + case EAT_LINE_COMMENT: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -1351,6 +1375,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) } break; + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + /* \r line terminator -- parse this character again */ + if (c != '\n' && c != self->delimiter) { + --i; + --buf; + } + break; + case EAT_COMMENT: if (c == '\n') { END_LINE(); From 9a877dd1da5fac8eb5a2169993cd18033e2e2cfa Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 28 Aug 2014 09:22:03 -0400 Subject: [PATCH 2/2] Added a release note --- doc/source/v0.15.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 95bf2918f8992..21e4e0e87473f 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -638,6 +638,8 @@ Bug Fixes - Bug in ``Float64Index`` where ``iat`` and ``at`` were not testing and were failing (:issue:`8092`). +- Bug in ``read_csv`` where line comments were not handled correctly given + a custom line terminator or ``delim_whitespace=True`` (:issue:`8122`).