2
2
Module contains tools for processing files into DataFrames or other objects
3
3
"""
4
4
5
+ from StringIO import StringIO
6
+
5
7
import numpy as np
6
8
7
9
from pandas .core .index import Index , MultiIndex
@@ -31,10 +33,12 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
31
33
dia .delimiter = sep
32
34
# attempt to sniff the delimiter
33
35
if sniff_sep :
34
- sample = f .readline ()
35
- sniffed = csv .Sniffer ().sniff (sample )
36
+ line = f .readline ()
37
+ sniffed = csv .Sniffer ().sniff (line )
36
38
dia .delimiter = sniffed .delimiter
37
- f .seek (0 )
39
+ buf = list (csv .reader (StringIO (line ), dialect = dia ))
40
+ else :
41
+ buf = []
38
42
39
43
reader = csv .reader (f , dialect = dia )
40
44
@@ -46,7 +50,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
46
50
parse_dates = parse_dates ,
47
51
date_parser = date_parser ,
48
52
skiprows = skiprows ,
49
- chunksize = chunksize )
53
+ chunksize = chunksize , buf = buf )
50
54
51
55
if nrows is not None :
52
56
return parser .get_chunk (nrows )
@@ -144,17 +148,18 @@ class TextParser(object):
144
148
145
149
def __init__ (self , data , names = None , header = 0 , index_col = None ,
146
150
na_values = None , parse_dates = False , date_parser = None ,
147
- chunksize = None , skiprows = None ):
151
+ chunksize = None , skiprows = None , buf = None ):
148
152
"""
149
153
Workhorse function for processing nested list into DataFrame
150
154
151
155
Should be replaced by np.genfromtxt eventually?
152
156
"""
153
157
self .data = data
154
158
155
- self .buf = []
159
+ # can pass rows read so far
160
+ self .buf = [] if buf is None else buf
161
+ self .pos = len (self .buf )
156
162
157
- self .pos = 0
158
163
self .names = list (names ) if names is not None else names
159
164
self .header = header
160
165
self .index_col = index_col
@@ -179,7 +184,10 @@ def _infer_columns(self):
179
184
self .header = None
180
185
181
186
if self .header is not None :
182
- line = self ._next_line ()
187
+ if len (self .buf ) > 0 :
188
+ line = self .buf [0 ]
189
+ else :
190
+ line = self ._next_line ()
183
191
while self .header > self .pos :
184
192
line = self ._next_line ()
185
193
@@ -196,17 +204,16 @@ def _infer_columns(self):
196
204
if cur_count > 0 :
197
205
columns [i ] = '%s.%d' % (col , cur_count )
198
206
counts [col ] = cur_count + 1
207
+ self ._clear_buffer ()
199
208
else :
200
209
line = self ._next_line ()
201
- self .buf .append (line )
202
210
203
211
ncols = len (line )
204
212
if not names :
205
213
columns = ['X.%d' % (i + 1 ) for i in range (ncols )]
206
214
else :
207
215
columns = names
208
216
209
- self ._clear_buffer ()
210
217
211
218
return columns
212
219
@@ -435,16 +442,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
435
442
datemode = self .book .datemode
436
443
sheet = self .book .sheet_by_name (sheetname )
437
444
438
- if skiprows is None :
439
- skiprows = set ()
440
- else :
441
- skiprows = set (skiprows )
442
-
443
445
data = []
444
446
for i in range (sheet .nrows ):
445
- if i in skiprows :
446
- continue
447
-
448
447
row = []
449
448
for value , typ in zip (sheet .row_values (i ), sheet .row_types (i )):
450
449
if typ == XL_CELL_DATE :
0 commit comments