Skip to content

Commit 70c3deb

Browse files
author
Chang She
committed
ENH: selectively parse columns in ExcelFile.parse #873
1 parent d719ce4 commit 70c3deb

File tree

3 files changed

+99
-18
lines changed

3 files changed

+99
-18
lines changed

doc/source/io.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,24 @@ additional arguments as the parsers above:
714714
To read sheets from an Excel 2007 file, you can pass a filename with a ``.xlsx``
715715
extension, in which case the ``openpyxl`` module will be used to read the file.
716716

717+
It is often the case that users will insert columns to do temporary computations
718+
in Excel and you may not want to read in those columns. ``ExcelFile.parse`` takes
719+
a ``parse_cols`` keyword to allow you to specify a subset of columns to parse.
720+
721+
If ``parse_cols`` is an integer, then it is assumed to indicate the zero-based index of the last column
722+
to be parsed.
723+
724+
.. code-block:: python
725+
726+
xls.parse('Sheet1', parse_cols=2, index_col=None, na_values=['NA'])
727+
728+
If ``parse_cols`` is a list of integers, then it is assumed to be the file column
729+
indices to be parsed.
730+
731+
.. code-block:: python
732+
733+
xls.parse('Sheet1', parse_cols=[0, 2, 3], index_col=None, na_values=['NA'])
734+
717735
To write a DataFrame object to a sheet of an Excel file, you can use the
718736
``to_excel`` instance method. The arguments are largely the same as ``to_csv``
719737
described above, the first argument being the name of the excel file, and the

pandas/io/parsers.py

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,8 +1208,8 @@ def __repr__(self):
12081208
return object.__repr__(self)
12091209

12101210
def parse(self, sheetname, header=0, skiprows=None, index_col=None,
1211-
parse_dates=False, date_parser=None, na_values=None,
1212-
thousands=None, chunksize=None):
1211+
parse_cols=None, parse_dates=False, date_parser=None,
1212+
na_values=None, thousands=None, chunksize=None):
12131213
"""
12141214
Read Excel table into DataFrame
12151215
@@ -1224,6 +1224,10 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
12241224
index_col : int, default None
12251225
Column to use as the row labels of the DataFrame. Pass None if
12261226
there is no such column
1227+
parse_cols : int or list, default None
1228+
If None then parse all columns,
1229+
If int then indicates last column to be parsed
1230+
If list of ints then indicates list of column numbers to be parsed
12271231
na_values : list-like, default None
12281232
List of additional strings to recognize as NA/NaN
12291233
@@ -1235,21 +1239,38 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
12351239
False:self._parse_xls}
12361240
return choose[self.use_xlsx](sheetname, header=header,
12371241
skiprows=skiprows, index_col=index_col,
1242+
parse_cols=parse_cols,
12381243
parse_dates=parse_dates,
12391244
date_parser=date_parser,
12401245
na_values=na_values,
12411246
thousands=thousands,
12421247
chunksize=chunksize)
12431248

1249+
def _should_parse(self, i, parse_cols):
1250+
if isinstance(parse_cols, int):
1251+
return i <= parse_cols
1252+
else:
1253+
return i in parse_cols
1254+
12441255
def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
1245-
parse_dates=False, date_parser=None, na_values=None,
1246-
thousands=None, chunksize=None):
1256+
parse_cols=None, parse_dates=False, date_parser=None,
1257+
na_values=None, thousands=None, chunksize=None):
12471258
sheet = self.book.get_sheet_by_name(name=sheetname)
12481259
data = []
12491260

12501261
# it brings a new method: iter_rows()
1262+
should_parse = {}
1263+
12511264
for row in sheet.iter_rows():
1252-
data.append([cell.internal_value for cell in row])
1265+
row_data = []
1266+
for j, cell in enumerate(row):
1267+
1268+
if parse_cols is not None and j not in should_parse:
1269+
should_parse[j] = self._should_parse(j, parse_cols)
1270+
1271+
if parse_cols is None or should_parse[j]:
1272+
row_data.append(cell.internal_value)
1273+
data.append(row_data)
12531274

12541275
if header is not None:
12551276
data[header] = _trim_excel_header(data[header])
@@ -1265,28 +1286,34 @@ def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None,
12651286
return parser.get_chunk()
12661287

12671288
def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None,
1268-
parse_dates=False, date_parser=None, na_values=None,
1269-
thousands=None, chunksize=None):
1289+
parse_cols=None, parse_dates=False, date_parser=None,
1290+
na_values=None, thousands=None, chunksize=None):
12701291
from datetime import MINYEAR, time, datetime
12711292
from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR
12721293

12731294
datemode = self.book.datemode
12741295
sheet = self.book.sheet_by_name(sheetname)
12751296

12761297
data = []
1298+
should_parse = {}
12771299
for i in range(sheet.nrows):
12781300
row = []
1279-
for value, typ in izip(sheet.row_values(i), sheet.row_types(i)):
1280-
if typ == XL_CELL_DATE:
1281-
dt = xldate_as_tuple(value, datemode)
1282-
# how to produce this first case?
1283-
if dt[0] < MINYEAR: # pragma: no cover
1284-
value = time(*dt[3:])
1285-
else:
1286-
value = datetime(*dt)
1287-
if typ == XL_CELL_ERROR:
1288-
value = np.nan
1289-
row.append(value)
1301+
for j, (value, typ) in enumerate(izip(sheet.row_values(i),
1302+
sheet.row_types(i))):
1303+
if parse_cols is not None and j not in should_parse:
1304+
should_parse[j] = self._should_parse(j, parse_cols)
1305+
1306+
if parse_cols is None or should_parse[j]:
1307+
if typ == XL_CELL_DATE:
1308+
dt = xldate_as_tuple(value, datemode)
1309+
# how to produce this first case?
1310+
if dt[0] < MINYEAR: # pragma: no cover
1311+
value = time(*dt[3:])
1312+
else:
1313+
value = datetime(*dt)
1314+
if typ == XL_CELL_ERROR:
1315+
value = np.nan
1316+
row.append(value)
12901317
data.append(row)
12911318

12921319
if header is not None:

pandas/io/tests/test_parsers.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,42 @@ def test_xlsx_table(self):
746746
assert_frame_equal(df, df2)
747747
assert_frame_equal(df3, df2)
748748

749+
def test_parse_cols_int(self):
    """
    An integer ``parse_cols`` should limit parsing to the leading columns
    for both the ``.xls`` and ``.xlsx`` fixture files.
    """
    _skip_if_no_openpyxl()

    # '' -> test.xls, 'x' -> test.xlsx
    for ext in ('', 'x'):
        path = os.path.join(self.dirpath, 'test.xls%s' % ext)
        book = ExcelFile(path)

        from_sheet1 = book.parse('Sheet1', index_col=0, parse_dates=True,
                                 parse_cols=3)

        # Reference frame: the CSV fixture restricted to the same columns.
        expected = read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=['A', 'B', 'C'])

        from_sheet2 = book.parse('Sheet2', skiprows=[1], index_col=0,
                                 parse_dates=True, parse_cols=3)

        assert_frame_equal(from_sheet1, expected)
        assert_frame_equal(from_sheet2, expected)
765+
766+
def test_parse_cols_list(self):
    """
    A list ``parse_cols`` should keep only the listed column positions
    for both the ``.xls`` and ``.xlsx`` fixture files.
    """
    _skip_if_no_openpyxl()

    # '' -> test.xls, 'x' -> test.xlsx
    for ext in ('', 'x'):
        path = os.path.join(self.dirpath, 'test.xls%s' % ext)
        book = ExcelFile(path)

        from_sheet1 = book.parse('Sheet1', index_col=0, parse_dates=True,
                                 parse_cols=[0, 2, 3])

        # Reference frame: the CSV fixture restricted to the same columns.
        expected = read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=['B', 'C'])

        from_sheet2 = book.parse('Sheet2', skiprows=[1], index_col=0,
                                 parse_dates=True,
                                 parse_cols=[0, 2, 3])

        assert_frame_equal(from_sheet1, expected)
        assert_frame_equal(from_sheet2, expected)
784+
749785
def test_read_table_wrong_num_columns(self):
750786
data = """A,B,C,D,E,F
751787
1,2,3,4,5

0 commit comments

Comments
 (0)