From e4fb9ed16442beace0f0b431a1e198799dc008bb Mon Sep 17 00:00:00 2001
From: Jesse Johnson <johnson@biomaps.rutgers.edu>
Date: Mon, 5 Aug 2013 15:22:38 -0400
Subject: [PATCH 1/2] ENH/BUG: ignore line comments in CSV files GH2685

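* also fix a bug in the CSV format sniffer: sniffing now also runs when no
  quotechar is set, and only fills in a delimiter/quotechar that is missing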
---
 pandas/io/parsers.py          | 42 ++++++++++++++++++++++++++---------
 pandas/src/parser/tokenizer.c |  5 ++---
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3b132be800cb1..a620363a4ae17 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -991,7 +991,6 @@ def __init__(self, src, **kwds):
                 self._name_processed = True
                 (index_names, self.names,
                  self.index_col) = _clean_index_names(self.names, self.index_col)
-
                 if self.index_names is None:
                     self.index_names = index_names
 
@@ -1100,7 +1099,6 @@ def _get_index_names(self):
         if self._reader.leading_cols == 0 and self.index_col is not None:
             (idx_names, names,
              self.index_col) = _clean_index_names(names, self.index_col)
-
         return names, idx_names
 
     def _maybe_parse_dates(self, values, index, try_parse_dates=True):
@@ -1282,9 +1280,8 @@ class MyDialect(csv.Dialect):
 
             sniff_sep = True
 
-            if sep is not None:
+            if (sep is not None) and (dia.quotechar is not None):
                 sniff_sep = False
-                dia.delimiter = sep
             # attempt to sniff the delimiter
             if sniff_sep:
                 line = f.readline()
@@ -1292,11 +1289,21 @@ class MyDialect(csv.Dialect):
                     self.pos += 1
                     line = f.readline()
 
-                line = self._check_comments([line])[0]
+                line = self._check_comments([[line]])
+
+                while not line:
+                    self.pos += 1
+                    line = f.readline()
+                    line = self._check_comments([[line]])
+
+                line = line[0][0]
 
                 self.pos += 1
                 sniffed = csv.Sniffer().sniff(line)
-                dia.delimiter = sniffed.delimiter
+                if not dia.delimiter:
+                    dia.delimiter = sniffed.delimiter
+                if not dia.quotechar:
+                    dia.quotechar = sniffed.quotechar
                 if self.encoding is not None:
                     self.buf.extend(list(
                         com.UnicodeReader(StringIO(line),
@@ -1466,14 +1473,26 @@ def _next_line(self):
                 line = self.data[self.pos]
             except IndexError:
                 raise StopIteration
+
+            line = self._check_comments([line])
+
+            while not line:
+                self.pos += 1
+                try:
+                    line = self.data[self.pos]
+                except IndexError:
+                    raise StopIteration
+                line = self._check_comments([line])
+
+            line = line[0]
         else:
             while self.pos in self.skiprows:
                 next(self.data)
                 self.pos += 1
 
             line = next(self.data)
+            line = self._check_comments([line])[0]
 
-        line = self._check_comments([line])[0]
         line = self._check_thousands([line])[0]
 
         self.pos += 1
@@ -1496,7 +1515,10 @@ def _check_comments(self, lines):
                     if len(x) > 0:
                         rl.append(x)
                     break
-            ret.append(rl)
+            if rl:
+                ret.append(rl)
+        if not ret:
+            ret = [[]];
         return ret
 
     def _check_thousands(self, lines):
@@ -1524,7 +1546,7 @@ def _clear_buffer(self):
     def _get_index_name(self, columns):
         orig_names = list(columns)
         columns = list(columns)
-
+        
         try:
             line = self._next_line()
         except StopIteration:
@@ -1539,7 +1561,7 @@ def _get_index_name(self, columns):
 
         # implicitly index_col=0 b/c 1 fewer column names
         implicit_first_cols = 0
-        if line is not None:
+        if line and (line is not None):
             # leave it 0, #2442
             if self.index_col is not False:
                 implicit_first_cols = len(line) - len(columns)
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index cad5d98dde53a..dc784be43e4bd 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -823,7 +823,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
             }
             else if (c == self->delimiter) {
                 // End of field. End of line not reached yet
-
                 END_FIELD();
                 self->state = START_FIELD;
             }
@@ -866,7 +865,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
             } else {
                 /* \r line terminator */
 
-                /* UGH. we don't actually want to consume the token. fix this later */
+                /* FIXME: we don't actually want to consume the token. */
                 self->stream_len = slen;
                 if (end_line(self) < 0) {
                     goto parsingerror;
@@ -875,7 +874,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
                 slen = self->stream_len;
                 self->state = START_RECORD;
 
-                /* HACK, let's try this one again */
+                /* FIXME: hack -- step back so this character is processed again */
                 --i; buf--;
                 if (line_limit > 0 && self->lines == start_lines + line_limit) {
                     goto linelimit;

From d680f13fd41fcd8c42a5597b93be924eec0e5153 Mon Sep 17 00:00:00 2001
From: Jesse Johnson <johnson@biomaps.rutgers.edu>
Date: Mon, 12 Aug 2013 14:41:43 -0400
Subject: [PATCH 2/2] TST: add test for CSV parser line comments

---
 pandas/io/tests/test_parsers.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 41345352b5ec5..d10feb7ac7c98 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1527,17 +1527,29 @@ def test_multiple_date_col_multiple_index(self):
 
     def test_comment(self):
         data = """A,B,C
-1,2.,4.#hello world
-5.,NaN,10.0
+#first line comment
+1,2.,4. # first end line comment
+# second line comment
+3,5.,7.#second end line comment
+6.,NaN,10.0
 """
-        expected = [[1., 2., 4.],
-                    [5., np.nan, 10.]]
-        df = self.read_csv(StringIO(data), comment='#')
-        assert_almost_equal(df.values, expected)
-
-        df = self.read_table(StringIO(data), sep=',', comment='#',
-                             na_values=['NaN'])
-        assert_almost_equal(df.values, expected)
+        expected = {
+            'c': [[np.nan, np.nan, np.nan],
+                  [1., 2., 4.],
+                  [np.nan, np.nan, np.nan],
+                  [3., 5., 7.],
+                  [6., np.nan, 10.]],
+            'python': [[1., 2., 4.],
+                       [3., 5., 7.],
+                       [6., np.nan, 10.]]
+            }
+        for engine in ('c', 'python'):
+            df = self.read_csv(StringIO(data), comment='#', engine=engine)
+            assert_almost_equal(df.values, expected[engine])
+
+            df = self.read_table(StringIO(data), sep=',', comment='#',
+                                 na_values=['NaN'], engine=engine)
+            assert_almost_equal(df.values, expected[engine])
 
     def test_bool_na_values(self):
         data = """A,B,C