ENH: selectively parse columns in ExcelFile.parse #873

Chang She · Chang She · commit 70c3deb1fd4a · 2012-07-13T00:53:15.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -714,6 +714,24 @@ additional arguments as the parsers above:
 To read sheets from an Excel 2007 file, you can pass a filename with a ``.xlsx``
 extension, in which case the ``openpyxl`` module will be used to read the file.
 
+It is often the case that users will insert columns to do temporary computations
+in Excel and you may not want to read in those columns. ``ExcelFile.parse`` takes
+a ``parse_cols`` keyword to allow you to specify a subset of columns to parse.
+
+If ``parse_cols`` is an integer, then it is assumed to indicate the last column
+to be parsed (zero-based and inclusive, so ``parse_cols=2`` reads columns 0-2).
+
+.. code-block:: python
+
+   xls.parse('Sheet1', parse_cols=2, index_col=None, na_values=['NA'])
+
+If ``parse_cols`` is a list of integers, then it is assumed to be the zero-based
+file column indices to be parsed.
+
+.. code-block:: python
+
+   xls.parse('Sheet1', parse_cols=[0, 2, 3], index_col=None, na_values=['NA'])
+
 To write a DataFrame object to a sheet of an Excel file, you can use the
 ``to_excel`` instance method.  The arguments are largely the same as ``to_csv``
 described above, the first argument being the name of the excel file, and the
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1208,8 +1208,8 @@ def __repr__(self):
         return object.__repr__(self)
 
     def parse(self, sheetname, header=0, skiprows=None, index_col=None,
-              parse_dates=False, date_parser=None, na_values=None,
-              thousands=None, chunksize=None):
+              parse_cols=None, parse_dates=False, date_parser=None,
+              na_values=None, thousands=None, chunksize=None):
         """
         Read Excel table into DataFrame
 
@@ -1224,6 +1224,10 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
         index_col : int, default None
             Column to use as the row labels of the DataFrame. Pass None if
             there is no such column
+        parse_cols : int or list, default None
+            If None then parse all columns.
+            If int then indicates last column to be parsed (0-based, inclusive).
+            If list of ints then indicates list of column numbers to be parsed.
         na_values : list-like, default None
             List of additional strings to recognize as NA/NaN
 
@@ -1235,21 +1239,38 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
                   False:self._parse_xls}
         return choose[self.use_xlsx](sheetname, header=header,
                                      skiprows=skiprows, index_col=index_col,
+                                     parse_cols=parse_cols,
                                      parse_dates=parse_dates,
                                      date_parser=date_parser,
                                      na_values=na_values,
                                      thousands=thousands,
                                      chunksize=chunksize)
 
+    def _should_parse(self, i, parse_cols):
+        if isinstance(parse_cols, int):
+            return i <= parse_cols
+        else:
+            return i in parse_cols
+
     def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
-                    parse_dates=False, date_parser=None, na_values=None,
-                    thousands=None, chunksize=None):
+                    parse_cols=None, parse_dates=False, date_parser=None,
+                    na_values=None, thousands=None, chunksize=None):
         sheet = self.book.get_sheet_by_name(name=sheetname)
         data = []
 
         # it brings a new method: iter_rows()
+        should_parse = {}
+
         for row in sheet.iter_rows():
-            data.append([cell.internal_value for cell in row])
+            row_data = []
+            for j, cell in enumerate(row):
+
+                if parse_cols is not None and j not in should_parse:
+                    should_parse[j] = self._should_parse(j, parse_cols)
+
+                if parse_cols is None or should_parse[j]:
+                    row_data.append(cell.internal_value)
+            data.append(row_data)
 
         if header is not None:
             data[header] = _trim_excel_header(data[header])
@@ -1265,28 +1286,34 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
         return parser.get_chunk()
 
     def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None,
-                   parse_dates=False, date_parser=None, na_values=None,
-                   thousands=None, chunksize=None):
+                   parse_cols=None, parse_dates=False, date_parser=None,
+                   na_values=None, thousands=None, chunksize=None):
         from datetime import MINYEAR, time, datetime
         from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR
 
         datemode = self.book.datemode
         sheet = self.book.sheet_by_name(sheetname)
 
         data = []
+        should_parse = {}
         for i in range(sheet.nrows):
             row = []
-            for value, typ in izip(sheet.row_values(i), sheet.row_types(i)):
-                if typ == XL_CELL_DATE:
-                    dt = xldate_as_tuple(value, datemode)
-                    # how to produce this first case?
-                    if dt[0] < MINYEAR: # pragma: no cover
-                        value = time(*dt[3:])
-                    else:
-                        value = datetime(*dt)
-                if typ == XL_CELL_ERROR:
-                    value = np.nan
-                row.append(value)
+            for j, (value, typ) in enumerate(izip(sheet.row_values(i),
+                                                  sheet.row_types(i))):
+                if parse_cols is not None and j not in should_parse:
+                    should_parse[j] = self._should_parse(j, parse_cols)
+
+                if parse_cols is None or should_parse[j]:
+                    if typ == XL_CELL_DATE:
+                        dt = xldate_as_tuple(value, datemode)
+                        # how to produce this first case?
+                        if dt[0] < MINYEAR: # pragma: no cover
+                            value = time(*dt[3:])
+                        else:
+                            value = datetime(*dt)
+                    if typ == XL_CELL_ERROR:
+                        value = np.nan
+                    row.append(value)
             data.append(row)
 
         if header is not None:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -746,6 +746,42 @@ def test_xlsx_table(self):
         assert_frame_equal(df, df2)
         assert_frame_equal(df3, df2)
 
+    def test_parse_cols_int(self):
+        _skip_if_no_openpyxl()
+
+        suffix = ['', 'x']
+
+        for s in suffix:
+            pth = os.path.join(self.dirpath, 'test.xls%s' % s)
+            xls = ExcelFile(pth)
+            df = xls.parse('Sheet1', index_col=0, parse_dates=True,
+                            parse_cols=3)
+            df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
+            df2 = df2.reindex(columns=['A', 'B', 'C'])
+            df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
+                            parse_dates=True, parse_cols=3)
+            assert_frame_equal(df, df2)
+            assert_frame_equal(df3, df2)
+
+    def test_parse_cols_list(self):
+        _skip_if_no_openpyxl()
+
+        suffix = ['', 'x']
+
+        for s in suffix:
+
+            pth = os.path.join(self.dirpath, 'test.xls%s' % s)
+            xlsx = ExcelFile(pth)
+            df = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
+                            parse_cols=[0, 2, 3])
+            df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
+            df2 = df2.reindex(columns=['B', 'C'])
+            df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0,
+                             parse_dates=True,
+                             parse_cols=[0, 2, 3])
+            assert_frame_equal(df, df2)
+            assert_frame_equal(df3, df2)
+
     def test_read_table_wrong_num_columns(self):
         data = """A,B,C,D,E,F
 1,2,3,4,5