Skip to content

Commit 0754624

Browse files
author
Chang She
committed
ENH: add comment keyword to text readers. #962
1 parent 9d87add commit 0754624

File tree

3 files changed

+101
-6
lines changed

3 files changed

+101
-6
lines changed

pandas/io/parsers.py

+54-3
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ def next(x):
5555
DD/MM format dates, international and European format
5656
thousands : str, default None
5757
Thousands separator
58+
comment : str, default None
59+
String marking the start of a comment; the remainder of the line after it is not parsed
5860
nrows : int, default None
5961
Number of rows of file to read. Useful for reading pieces of large files
6062
iterator : boolean, default False
@@ -179,6 +181,7 @@ def read_csv(filepath_or_buffer,
179181
skiprows=None,
180182
na_values=None,
181183
thousands=None,
184+
comment=None,
182185
parse_dates=False,
183186
dayfirst=False,
184187
date_parser=None,
@@ -208,6 +211,7 @@ def read_table(filepath_or_buffer,
208211
skiprows=None,
209212
na_values=None,
210213
thousands=None,
214+
comment=None,
211215
parse_dates=False,
212216
dayfirst=False,
213217
date_parser=None,
@@ -241,6 +245,7 @@ def read_fwf(filepath_or_buffer,
241245
skiprows=None,
242246
na_values=None,
243247
thousands=None,
248+
comment=None,
244249
parse_dates=False,
245250
dayfirst=False,
246251
date_parser=None,
@@ -339,6 +344,10 @@ class TextParser(object):
339344
Column or columns to use as the (possibly hierarchical) index
340345
na_values : iterable, default None
341346
Custom NA values
347+
thousands : str, default None
348+
Thousands separator
349+
comment : str, default None
350+
String marking the start of a comment; the remainder of the line after it is ignored
342351
parse_dates : boolean, default False
343352
date_parser : function, default None
344353
skiprows : list of integers
@@ -351,7 +360,7 @@ class TextParser(object):
351360

352361
def __init__(self, f, delimiter=None, names=None, header=0,
353362
index_col=None, na_values=None, thousands=None,
354-
parse_dates=False,
363+
comment=None, parse_dates=False,
355364
date_parser=None, dayfirst=False, chunksize=None,
356365
skiprows=None, skip_footer=0, converters=None,
357366
verbose=False, encoding=None):
@@ -398,6 +407,7 @@ def __init__(self, f, delimiter=None, names=None, header=0,
398407
self.na_values = set(list(na_values)) | _NA_VALUES
399408

400409
self.thousands = thousands
410+
self.comment = comment
401411

402412
if hasattr(f, 'readline'):
403413
self._make_reader(f)
@@ -430,6 +440,12 @@ def _make_reader(self, f):
430440
self.pos += 1
431441
line = f.readline()
432442

443+
while self._is_commented(line):
444+
self.pos += 1
445+
line = f.readline()
446+
447+
line = self._check_comments([line])[0]
448+
433449
self.pos += 1
434450
sniffed = csv.Sniffer().sniff(line)
435451
dia.delimiter = sniffed.delimiter
@@ -498,22 +514,56 @@ def _next_line(self):
498514
self.pos += 1
499515

500516
try:
501-
line = self.data[self.pos]
517+
while True:
518+
line = self.data[self.pos]
519+
if not self._is_commented(line):
520+
break
521+
self.pos += 1
502522
except IndexError:
503523
raise StopIteration
504524
else:
505525
while self.pos in self.skiprows:
506526
next(self.data)
507527
self.pos += 1
508-
line = next(self.data)
509528

529+
while True:
530+
line = next(self.data)
531+
if not self._is_commented(line):
532+
break
533+
self.pos += 1
534+
535+
line = self._check_comments([line])[0]
510536
line = self._check_thousands([line])[0]
511537

512538
self.pos += 1
513539
self.buf.append(line)
514540

515541
return line
516542

543+
def _is_commented(self, line):
544+
if self.comment is None or len(line) == 0:
545+
return False
546+
return line[0].startswith(self.comment)
547+
548+
def _check_comments(self, lines):
549+
if self.comment is None:
550+
return lines
551+
ret = []
552+
for l in lines:
553+
rl = []
554+
for x in l:
555+
if (not isinstance(x, basestring) or
556+
self.comment not in x):
557+
rl.append(x)
558+
else:
559+
x = x[:x.find(self.comment)]
560+
if len(x) > 0:
561+
rl.append(x)
562+
break
563+
if len(rl) > 0:
564+
ret.append(rl)
565+
return ret
566+
517567
def _check_thousands(self, lines):
518568
if self.thousands is None:
519569
return lines
@@ -730,6 +780,7 @@ def _get_lines(self, rows=None):
730780
if self.skip_footer:
731781
lines = lines[:-self.skip_footer]
732782

783+
lines = self._check_comments(lines)
733784
return self._check_thousands(lines)
734785

735786
def _convert_to_ndarrays(dct, na_values, verbose=False):

pandas/io/tests/test_parsers.py

+26
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,32 @@ def test_1000_fwf(self):
6666
thousands=',')
6767
assert_almost_equal(df.values, expected)
6868

69+
def test_comment(self):
70+
data = """A,B,C
71+
1,2.,4.#hello world
72+
#hello self
73+
5.,NaN,10.0
74+
"""
75+
expected = [[1., 2., 4.],
76+
[5., np.nan, 10.]]
77+
df = read_csv(StringIO(data), comment='#')
78+
assert_almost_equal(df.values, expected)
79+
80+
df = read_table(StringIO(data), sep=',', comment='#', na_values=['NaN'])
81+
assert_almost_equal(df.values, expected)
82+
83+
def test_comment_fwf(self):
84+
data = """
85+
1 2. 4 #hello world
86+
#hello self
87+
5 NaN 10.0
88+
"""
89+
expected = [[1, 2., 4],
90+
[5, np.nan, 10.]]
91+
df = read_fwf(StringIO(data), colspecs=[(0,3),(4,9),(9,25)],
92+
comment='#')
93+
assert_almost_equal(df.values, expected)
94+
6995
def test_custom_na_values(self):
7096
data = """A,B,C
7197
ignore,this,row

vb_suite/parser.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
N = 10000
1111
K = 8
1212
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
13-
df.to_csv('test.csv')
13+
df.to_csv('test.csv', sep='|')
1414
"""
1515

16-
read_csv_vb = Benchmark("read_csv('test.csv')", setup,
16+
read_csv_vb = Benchmark("read_csv('test.csv', sep='|')", setup,
1717
cleanup="os.remove('test.csv')",
1818
start_date=datetime(2012, 5, 7))
1919

@@ -29,6 +29,24 @@
2929
df.to_csv('test.csv', sep='|')
3030
"""
3131

32-
read_csv_thou_vb = Benchmark("read_csv('test.csv')", setup,
32+
read_csv_thou_vb = Benchmark("read_csv('test.csv', sep='|', thousands=',')",
33+
setup,
3334
cleanup="os.remove('test.csv')",
3435
start_date=datetime(2012, 5, 7))
36+
37+
setup = common_setup + """
38+
from pandas import read_csv
39+
import os
40+
N = 10000
41+
K = 8
42+
format = lambda x: '%f' % x
43+
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
44+
df = df.applymap(format)
45+
df.ix[:5, 0] = '#'
46+
df.to_csv('test.csv', sep='|')
47+
"""
48+
49+
read_csv_comment_vb = Benchmark("read_csv('test.csv', sep='|', comment='#')",
50+
setup,
51+
cleanup="os.remove('test.csv')",
52+
start_date=datetime(2012, 5, 7))

0 commit comments

Comments
 (0)