From e4fb9ed16442beace0f0b431a1e198799dc008bb Mon Sep 17 00:00:00 2001 From: Jesse Johnson Date: Mon, 5 Aug 2013 15:22:38 -0400 Subject: [PATCH 1/2] ENH/BUG: ignore line comments in CSV files GH2685 * also fix bug in CSV format sniffer --- pandas/io/parsers.py | 42 ++++++++++++++++++++++++++--------- pandas/src/parser/tokenizer.c | 5 ++--- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3b132be800cb1..a620363a4ae17 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -991,7 +991,6 @@ def __init__(self, src, **kwds): self._name_processed = True (index_names, self.names, self.index_col) = _clean_index_names(self.names, self.index_col) - if self.index_names is None: self.index_names = index_names @@ -1100,7 +1099,6 @@ def _get_index_names(self): if self._reader.leading_cols == 0 and self.index_col is not None: (idx_names, names, self.index_col) = _clean_index_names(names, self.index_col) - return names, idx_names def _maybe_parse_dates(self, values, index, try_parse_dates=True): @@ -1282,9 +1280,8 @@ class MyDialect(csv.Dialect): sniff_sep = True - if sep is not None: + if (sep is not None) and (dia.quotechar is not None): sniff_sep = False - dia.delimiter = sep # attempt to sniff the delimiter if sniff_sep: line = f.readline() @@ -1292,11 +1289,21 @@ class MyDialect(csv.Dialect): self.pos += 1 line = f.readline() - line = self._check_comments([line])[0] + line = self._check_comments([[line]]) + + while not line: + self.pos += 1 + line = f.readline() + line = self._check_comments([[line]]) + + line = line[0][0] self.pos += 1 sniffed = csv.Sniffer().sniff(line) - dia.delimiter = sniffed.delimiter + if not dia.delimiter: + dia.delimiter = sniffed.delimiter + if not dia.quotechar: + dia.quotechar = sniffed.quotechar if self.encoding is not None: self.buf.extend(list( com.UnicodeReader(StringIO(line), @@ -1466,14 +1473,26 @@ def _next_line(self): line = self.data[self.pos] except IndexError: raise StopIteration + + line = self._check_comments([line]) + + while not line: + self.pos += 1 + try: + line = self.data[self.pos] + except IndexError: + raise StopIteration + line = self._check_comments([line]) + + line = line[0] else: while self.pos in self.skiprows: next(self.data) self.pos += 1 line = next(self.data) + line = self._check_comments([line])[0] - line = self._check_comments([line])[0] line = self._check_thousands([line])[0] self.pos += 1 @@ -1496,7 +1515,10 @@ def _check_comments(self, lines): if len(x) > 0: rl.append(x) break - ret.append(rl) + if rl: + ret.append(rl) + if not ret: + ret = [[]]; return ret def _check_thousands(self, lines): @@ -1524,7 +1546,7 @@ def _clear_buffer(self): def _get_index_name(self, columns): orig_names = list(columns) columns = list(columns) - + try: line = self._next_line() except StopIteration: @@ -1539,7 +1561,7 @@ def _get_index_name(self, columns): # implicitly index_col=0 b/c 1 fewer column names implicit_first_cols = 0 - if line is not None: + if line and (line is not None): # leave it 0, #2442 if self.index_col is not False: implicit_first_cols = len(line) - len(columns) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index cad5d98dde53a..dc784be43e4bd 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -823,7 +823,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } else if (c == self->delimiter) { // End of field. End of line not reached yet - END_FIELD(); self->state = START_FIELD; } @@ -866,7 +865,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } else { /* \r line terminator */ - /* UGH. we don't actually want to consume the token. fix this later */ + /*FIXME UGH. we don't actually want to consume the token. */ self->stream_len = slen; if (end_line(self) < 0) { goto parsingerror; @@ -875,7 +874,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) slen = self->stream_len; self->state = START_RECORD; - /* HACK, let's try this one again */ + /*FIXME let's try this one again */ --i; buf--; if (line_limit > 0 && self->lines == start_lines + line_limit) { goto linelimit; From d680f13fd41fcd8c42a5597b93be924eec0e5153 Mon Sep 17 00:00:00 2001 From: Jesse Johnson Date: Mon, 12 Aug 2013 14:41:43 -0400 Subject: [PATCH 2/2] TST: add test for CSV parser line comments --- pandas/io/tests/test_parsers.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 41345352b5ec5..d10feb7ac7c98 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1527,17 +1527,29 @@ def test_multiple_date_col_multiple_index(self): def test_comment(self): data = """A,B,C -1,2.,4.#hello world -5.,NaN,10.0 +#first line comment +1,2.,4. # first end line comment +# second line comment +3,5.,7.#second end line comment +6.,NaN,10.0 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - assert_almost_equal(df.values, expected) - - df = self.read_table(StringIO(data), sep=',', comment='#', - na_values=['NaN']) - assert_almost_equal(df.values, expected) + expected = { + 'c': [[np.nan, np.nan, np.nan], + [1., 2., 4.], + [np.nan, np.nan, np.nan], + [3., 5., 7.], + [6., np.nan, 10.]], + 'python': [[1., 2., 4.], + [3., 5., 7.], + [6., np.nan, 10.]] + } + for engine in ('c', 'python'): + df = self.read_csv(StringIO(data), comment='#', engine=engine) + assert_almost_equal(df.values, expected[engine]) + + df = self.read_table(StringIO(data), sep=',', comment='#', + na_values=['NaN'], engine=engine) + assert_almost_equal(df.values, expected[engine]) def test_bool_na_values(self): data = """A,B,C