ENH: handle zip file when sniffing CSV separator. pass test suite

wesm · wesm · commit 94724284a940 · 2011-10-20T10:45:02.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ pandas/src/tseries.c
 pandas/src/sparse.c
 pandas/version.py
 doc/source/generated
+doc/source/_static
 *flymake*
 scikits
 .coverage
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -138,6 +138,7 @@ feedback on the library.
   - Add more helpful error message when importing pandas post-installation from
     the source directory (GH #250)
 
+
 **Bug fixes**
 
   - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should
@@ -165,6 +166,8 @@ feedback on the library.
     - Passing column names should force `header=None` (GH #257)
     - Don't modify passed column names when `index_col` is not
       None (GH #258)
+    - Can sniff CSV separator in zip file (previously failed because seek is
+      not supported)
 
 Thanks
 ------
@@ -291,6 +294,8 @@ infrastructure are the main new additions
     retrieve groups
   - Added informative Exception when passing dict to DataFrame groupby
     aggregation with axis != 0
+  - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type
+    DataFrame objects
 
 **API Changes**
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2,6 +2,8 @@
 Module contains tools for processing files into DataFrames or other objects
 """
 
+from StringIO import StringIO
+
 import numpy as np
 
 from pandas.core.index import Index, MultiIndex
@@ -31,10 +33,12 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
         dia.delimiter = sep
     # attempt to sniff the delimiter
     if sniff_sep:
-        sample = f.readline()
-        sniffed = csv.Sniffer().sniff(sample)
+        line = f.readline()
+        sniffed = csv.Sniffer().sniff(line)
         dia.delimiter = sniffed.delimiter
-        f.seek(0)
+        buf = list(csv.reader(StringIO(line), dialect=dia))
+    else:
+        buf = []
 
     reader = csv.reader(f, dialect=dia)
 
@@ -46,7 +50,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
                         parse_dates=parse_dates,
                         date_parser=date_parser,
                         skiprows=skiprows,
-                        chunksize=chunksize)
+                        chunksize=chunksize, buf=buf)
 
     if nrows is not None:
         return parser.get_chunk(nrows)
@@ -144,17 +148,18 @@ class TextParser(object):
 
     def __init__(self, data, names=None, header=0, index_col=None,
                  na_values=None, parse_dates=False, date_parser=None,
-                 chunksize=None, skiprows=None):
+                 chunksize=None, skiprows=None, buf=None):
         """
         Workhorse function for processing nested list into DataFrame
 
         Should be replaced by np.genfromtxt eventually?
         """
         self.data = data
 
-        self.buf = []
+        # can pass rows read so far
+        self.buf = [] if buf is None else buf
+        self.pos = len(self.buf)
 
-        self.pos = 0
         self.names = list(names) if names is not None else names
         self.header = header
         self.index_col = index_col
@@ -179,7 +184,10 @@ def _infer_columns(self):
             self.header = None
 
         if self.header is not None:
-            line = self._next_line()
+            if len(self.buf) > 0:
+                line = self.buf[0]
+            else:
+                line = self._next_line()
             while self.header > self.pos:
                 line = self._next_line()
 
@@ -196,17 +204,16 @@ def _infer_columns(self):
                 if cur_count > 0:
                     columns[i] = '%s.%d' % (col, cur_count)
                 counts[col] = cur_count + 1
+            self._clear_buffer()
         else:
             line = self._next_line()
-            self.buf.append(line)
 
             ncols = len(line)
             if not names:
                 columns = ['X.%d' % (i + 1) for i in range(ncols)]
             else:
                 columns = names
 
-        self._clear_buffer()
 
         return columns
 
@@ -435,16 +442,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
         datemode = self.book.datemode
         sheet = self.book.sheet_by_name(sheetname)
 
-        if skiprows is None:
-            skiprows = set()
-        else:
-            skiprows = set(skiprows)
-
         data = []
         for i in range(sheet.nrows):
-            if i in skiprows:
-                continue
-
             row = []
             for value, typ in zip(sheet.row_values(i), sheet.row_types(i)):
                 if typ == XL_CELL_DATE:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -128,8 +128,8 @@ def test_header_with_index_col(self):
 
         self.assertEqual(names, ['A', 'B', 'C'])
 
-        data = [[1,2,3],[4,5,6],[7,8,9]]
-        expected = DataFrame(data, index=['foo','bar','baz'],
+        values = [[1,2,3],[4,5,6],[7,8,9]]
+        expected = DataFrame(values, index=['foo','bar','baz'],
                              columns=['A','B','C'])
         assert_frame_equal(df, expected)