Skip to content

Commit 9472428

Browse files
committed
ENH: handle zip file. pass test suite
1 parent 4a218da commit 9472428

File tree

4 files changed

+25
-20
lines changed

4 files changed

+25
-20
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pandas/src/tseries.c
88
pandas/src/sparse.c
99
pandas/version.py
1010
doc/source/generated
11+
doc/source/_static
1112
*flymake*
1213
scikits
1314
.coverage

RELEASE.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ feedback on the library.
138138
- Add more helpful error message when importing pandas post-installation from
139139
the source directory (GH #250)
140140

141+
141142
**Bug fixes**
142143

143144
- Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should
@@ -165,6 +166,8 @@ feedback on the library.
165166
- Passing column names should force `header=None` (GH #257)
166167
- Don't modify passed column names when `index_col` is not
167168
None (GH #258)
169+
- Can sniff CSV separator in zip file (previously failed because file objects
170+
opened from a zip archive do not support seek)
168171

169172
Thanks
170173
------
@@ -291,6 +294,8 @@ infrastructure are the main new additions
291294
retrieve groups
292295
- Added informative Exception when passing dict to DataFrame groupby
293296
aggregation with axis != 0
297+
- Significantly speed up DataFrame `__repr__` and `count` on large mixed-type
298+
DataFrame objects
294299

295300
**API Changes**
296301

pandas/io/parsers.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Module contains tools for processing files into DataFrames or other objects
33
"""
44

5+
from StringIO import StringIO
6+
57
import numpy as np
68

79
from pandas.core.index import Index, MultiIndex
@@ -31,10 +33,12 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
3133
dia.delimiter = sep
3234
# attempt to sniff the delimiter
3335
if sniff_sep:
34-
sample = f.readline()
35-
sniffed = csv.Sniffer().sniff(sample)
36+
line = f.readline()
37+
sniffed = csv.Sniffer().sniff(line)
3638
dia.delimiter = sniffed.delimiter
37-
f.seek(0)
39+
buf = list(csv.reader(StringIO(line), dialect=dia))
40+
else:
41+
buf = []
3842

3943
reader = csv.reader(f, dialect=dia)
4044

@@ -46,7 +50,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
4650
parse_dates=parse_dates,
4751
date_parser=date_parser,
4852
skiprows=skiprows,
49-
chunksize=chunksize)
53+
chunksize=chunksize, buf=buf)
5054

5155
if nrows is not None:
5256
return parser.get_chunk(nrows)
@@ -144,17 +148,18 @@ class TextParser(object):
144148

145149
def __init__(self, data, names=None, header=0, index_col=None,
146150
na_values=None, parse_dates=False, date_parser=None,
147-
chunksize=None, skiprows=None):
151+
chunksize=None, skiprows=None, buf=None):
148152
"""
149153
Workhorse function for processing nested list into DataFrame
150154
151155
Should be replaced by np.genfromtxt eventually?
152156
"""
153157
self.data = data
154158

155-
self.buf = []
159+
# can pass rows read so far
160+
self.buf = [] if buf is None else buf
161+
self.pos = len(self.buf)
156162

157-
self.pos = 0
158163
self.names = list(names) if names is not None else names
159164
self.header = header
160165
self.index_col = index_col
@@ -179,7 +184,10 @@ def _infer_columns(self):
179184
self.header = None
180185

181186
if self.header is not None:
182-
line = self._next_line()
187+
if len(self.buf) > 0:
188+
line = self.buf[0]
189+
else:
190+
line = self._next_line()
183191
while self.header > self.pos:
184192
line = self._next_line()
185193

@@ -196,17 +204,16 @@ def _infer_columns(self):
196204
if cur_count > 0:
197205
columns[i] = '%s.%d' % (col, cur_count)
198206
counts[col] = cur_count + 1
207+
self._clear_buffer()
199208
else:
200209
line = self._next_line()
201-
self.buf.append(line)
202210

203211
ncols = len(line)
204212
if not names:
205213
columns = ['X.%d' % (i + 1) for i in range(ncols)]
206214
else:
207215
columns = names
208216

209-
self._clear_buffer()
210217

211218
return columns
212219

@@ -435,16 +442,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
435442
datemode = self.book.datemode
436443
sheet = self.book.sheet_by_name(sheetname)
437444

438-
if skiprows is None:
439-
skiprows = set()
440-
else:
441-
skiprows = set(skiprows)
442-
443445
data = []
444446
for i in range(sheet.nrows):
445-
if i in skiprows:
446-
continue
447-
448447
row = []
449448
for value, typ in zip(sheet.row_values(i), sheet.row_types(i)):
450449
if typ == XL_CELL_DATE:

pandas/io/tests/test_parsers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ def test_header_with_index_col(self):
128128

129129
self.assertEqual(names, ['A', 'B', 'C'])
130130

131-
data = [[1,2,3],[4,5,6],[7,8,9]]
132-
expected = DataFrame(data, index=['foo','bar','baz'],
131+
values = [[1,2,3],[4,5,6],[7,8,9]]
132+
expected = DataFrame(values, index=['foo','bar','baz'],
133133
columns=['A','B','C'])
134134
assert_frame_equal(df, expected)
135135

0 commit comments

Comments
 (0)