From a2aff4c9bba4ba5b1b0dfdf38a6cb3bfc0802f63 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Fri, 27 Jun 2014 08:34:16 -0400 Subject: [PATCH] Squashed commit of the following: commit 5e9e0fa29d727953583a116638a9d0db81f9ed21 Author: Michael Mueller Date: Thu Jun 26 19:53:35 2014 -0400 Fixed issue with empty lines commit 57b54918b251ab77f000b575d77bcce3affcb27a Author: Michael Mueller Date: Thu Jun 26 16:31:27 2014 -0400 Added reference to new functionality in docs commit a2371638691584416439d3c6a4dd2ef1829dcbe3 Author: Michael Mueller Date: Thu Jun 26 16:26:06 2014 -0400 Implemented functionality to ignore comment lines, wrote a test --- doc/source/io.rst | 35 ++++++++-- doc/source/v0.14.1.txt | 6 +- pandas/io/parsers.py | 48 +++++++++---- pandas/io/tests/test_parsers.py | 118 ++++++++++++++++++++++++++++++++ pandas/parser.pyx | 2 + pandas/src/parser/tokenizer.c | 22 ++++++ pandas/src/parser/tokenizer.h | 2 + 7 files changed, 211 insertions(+), 22 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index bc58b04de4473..0f698306a6517 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -98,8 +98,10 @@ They can take a number of arguments: data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass ``header=0`` to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns - E.g. [0,1,3]. Intervening rows that are not specified will be skipped. - (E.g. 2 in this example are skipped) + E.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example are skipped). Note that this parameter + ignores commented lines, so header=0 denotes the first line of + data rather than the first line of the file. - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows - ``index_col``: column number, column name, or list of column numbers/names, @@ -145,8 +147,12 @@ They can take a number of arguments: Acceptable values are 0, 1, 2, and 3 for QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONE, and QUOTE_NONNUMERIC, respectively. - ``skipinitialspace`` : boolean, default ``False``, Skip spaces after delimiter - ``escapechar`` : string, to specify how to escape quoted data - - ``comment``: denotes the start of a comment and ignores the rest of the line. - Currently line commenting is not supported. + - ``comment``: Indicates remainder of line should not be parsed. If found at the + beginning of a line, the line will be ignored altogether. This parameter + must be a single character. Also, fully commented lines + are ignored by the parameter `header` but not by `skiprows`. For example, + if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will + result in '1,2,3' being treated as the header. - ``nrows``: Number of rows to read out of the file. Useful to only read a small portion of a large file - ``iterator``: If True, return a ``TextFileReader`` to enable reading a file @@ -252,6 +258,27 @@ after a delimiter: data = 'a, b, c\n1, 2, 3\n4, 5, 6' print(data) pd.read_csv(StringIO(data), skipinitialspace=True) + +Moreover, ``read_csv`` ignores any completely commented lines: + +.. ipython:: python + + data = 'a,b,c\n# commented line\n1,2,3\n#another comment\n4,5,6' + print(data) + pd.read_csv(StringIO(data), comment='#') + +.. note:: + + The presence of ignored lines might create ambiguities involving line numbers; + the parameter ``header`` uses row numbers (ignoring commented + lines), while ``skiprows`` uses line numbers (including commented lines): + + .. ipython:: python + + data = '#comment\na,b,c\nA,B,C\n1,2,3' + pd.read_csv(StringIO(data), comment='#', header=1) + data = 'A,B,C\n#comment\na,b,c\n1,2,3' + pd.read_csv(StringIO(data), comment='#', skiprows=2) The parsers make every attempt to "do the right thing" and not be very fragile. Type inference is a pretty big deal. So if a column can be coerced to diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index 45a5d55ca047d..abe2505b5adf7 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -102,9 +102,9 @@ Enhancements - - - +- The file parsers ``read_csv`` and ``read_table`` now ignore line comments provided by + the parameter `comment`, which accepts only a single character for the C reader. + In particular, they allow for comments before file data begins (:issue:`2685`) - Tests for basic reading of public S3 buckets now exist (:issue:`7281`). - ``read_html`` now sports an ``encoding`` argument that is passed to the underlying parser library. You can use this to read non-ascii encoded web diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 22fe3ef16e34d..3e4155491fc9c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -64,9 +64,11 @@ class ParserWarning(Warning): pass ``header=0`` to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns E.g. [0,1,3]. Intervening rows that are not specified will be - skipped. (E.g. 2 in this example are skipped) + skipped (e.g. 2 in this example are skipped). Note that this parameter + ignores commented lines, so header=0 denotes the first line of + data rather than the first line of the file. skiprows : list-like or integer - Row numbers to skip (0-indexed) or number of rows to skip (int) + Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file index_col : int or sequence or False, default None Column to use as the row labels of the DataFrame. If a sequence is given, a @@ -106,8 +108,12 @@ class ParserWarning(Warning): thousands : str, default None Thousands separator comment : str, default None - Indicates remainder of line should not be parsed - Does not support line commenting (will return empty line) + Indicates remainder of line should not be parsed. If found at the + beginning of a line, the line will be ignored altogether. This parameter + must be a single character. Also, fully commented lines + are ignored by the parameter `header` but not by `skiprows`. For example, + if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will + result in '1,2,3' being treated as the header. decimal : str, default '.' Character to recognize as decimal point. E.g. use ',' for European data nrows : int, default None @@ -1313,6 +1319,7 @@ def __init__(self, f, **kwds): self.data = None self.buf = [] self.pos = 0 + self.line_pos = 0 self.encoding = kwds['encoding'] self.compression = kwds['compression'] @@ -1459,6 +1466,7 @@ class MyDialect(csv.Dialect): line = self._check_comments([line])[0] self.pos += 1 + self.line_pos += 1 sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter if self.encoding is not None: @@ -1566,7 +1574,7 @@ def _infer_columns(self): if self.header is not None: header = self.header - # we have a mi columns, so read and extra line + # we have a mi columns, so read an extra line if isinstance(header, (list, tuple, np.ndarray)): have_mi_columns = True header = list(header) + [header[-1] + 1] @@ -1578,9 +1586,8 @@ def _infer_columns(self): for level, hr in enumerate(header): line = self._buffered_line() - while self.pos <= hr: + while self.line_pos <= hr: line = self._next_line() - unnamed_count = 0 this_columns = [] for i, c in enumerate(line): @@ -1705,25 +1712,36 @@ def _buffered_line(self): else: return self._next_line() + def _empty(self, line): + return not line or all(not x for x in line) + def _next_line(self): if isinstance(self.data, list): while self.pos in self.skiprows: self.pos += 1 - try: - line = self.data[self.pos] - except IndexError: - raise StopIteration + while True: + try: + line = self._check_comments([self.data[self.pos]])[0] + self.pos += 1 + # either uncommented or blank to begin with + if self._empty(self.data[self.pos - 1]) or line: + break + except IndexError: + raise StopIteration else: while self.pos in self.skiprows: next(self.data) self.pos += 1 - line = next(self.data) - - line = self._check_comments([line])[0] + while True: + orig_line = next(self.data) + line = self._check_comments([orig_line])[0] + self.pos += 1 + if self._empty(orig_line) or line: + break - self.pos += 1 + self.line_pos += 1 self.buf.append(line) return line diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index c02a3172f4adc..5f219d86ecff3 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1584,6 +1584,65 @@ def test_read_table_buglet_4x_multiindex(self): df = self.read_table(StringIO(text), sep='\s+') self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows(self): + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # this should ignore the first four lines (including comments) + df = self.read_csv(StringIO(data), comment='#', skiprows=4) + tm.assert_almost_equal(df.values, expected) + + def test_comment_header(self): + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # header should begin at the second non-comment line + df = self.read_csv(StringIO(data), comment='#', header=1) + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows_header(self): + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # skiprows should skip the first 4 lines (including comments), while + # header should start from the second non-commented line starting + # with line 5 + df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + tm.assert_almost_equal(df.values, expected) + def test_read_csv_parse_simple_list(self): text = """foo bar baz @@ -2874,6 +2933,65 @@ def test_parse_dates_empty_string(self): def test_usecols(self): raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows(self): + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # this should ignore the first four lines (including comments) + df = self.read_csv(StringIO(data), comment='#', skiprows=4) + tm.assert_almost_equal(df.values, expected) + + def test_comment_header(self): + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # header should begin at the second non-comment line + df = self.read_csv(StringIO(data), comment='#', header=1) + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows_header(self): + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # skiprows should skip the first 4 lines (including comments), while + # header should start from the second non-commented line starting + # with line 5 + df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + tm.assert_almost_equal(df.values, expected) + def test_passing_dtype(self): # GH 6607 # This is a copy which should eventually be merged into ParserTests diff --git a/pandas/parser.pyx b/pandas/parser.pyx index f303298e88273..199d4ab44abfa 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -78,8 +78,10 @@ cdef extern from "parser/tokenizer.h": ESCAPE_IN_QUOTED_FIELD QUOTE_IN_QUOTED_FIELD EAT_CRNL + EAT_CRNL_NOP EAT_WHITESPACE EAT_COMMENT + EAT_LINE_COMMENT FINISHED enum: ERROR_OVERFLOW diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index f3da2175092e7..1e9576487b9ed 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -698,6 +698,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } else if (c == '\r') { self->state = EAT_CRNL; break; + } else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; } /* normal character - handle as START_FIELD */ @@ -752,6 +755,16 @@ int tokenize_delimited(parser_t *self, size_t line_limit) self->state = IN_FIELD; break; + case EAT_LINE_COMMENT: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + case IN_FIELD: /* in unquoted field */ if (c == '\n') { @@ -883,6 +896,15 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } break; + case EAT_CRNL_NOP: /* inside an ignored comment line */ + self->state = START_RECORD; + /* \r line terminator -- parse this character again */ + if (c != '\n' && c != self->delimiter) { + --i; + --buf; + } + break; + default: break; diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 4e40d892a8b4a..6af63c07f1104 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -121,8 +121,10 @@ typedef enum { ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD, EAT_CRNL, + EAT_CRNL_NOP, EAT_WHITESPACE, EAT_COMMENT, + EAT_LINE_COMMENT, FINISHED } ParserState;