Skip to content

Commit 1a46dba

Browse files
alysivji authored and jreback committed
Add nrows parameter to pandas.read_excel() (#18507)
1 parent e658148 commit 1a46dba

File tree

4 files changed

+172
-72
lines changed

4 files changed

+172
-72
lines changed

doc/source/whatsnew/v0.22.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ Other Enhancements
135135
- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
136136
- :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`).
137137
- :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`)
138-
138+
- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
139139

140140
.. _whatsnew_0220.api_breaking:
141141

@@ -188,6 +188,7 @@ Other API Changes
188188
- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
189189
- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
190190
- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
191+
- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`)
191192

192193
.. _whatsnew_0220.deprecations:
193194

pandas/io/excel.py

+140-70
Original file line numberDiff line numberDiff line change
@@ -70,31 +70,37 @@
7070
* None -> All sheets as a dictionary of DataFrames
7171
7272
sheetname : string, int, mixed list of strings/ints, or None, default 0
73+
7374
.. deprecated:: 0.21.0
7475
Use `sheet_name` instead
7576
7677
header : int, list of ints, default 0
7778
Row (0-indexed) to use for the column labels of the parsed
7879
DataFrame. If a list of integers is passed those row positions will
7980
be combined into a ``MultiIndex``. Use None if there is no header.
80-
skiprows : list-like
81-
Rows to skip at the beginning (0-indexed)
82-
skip_footer : int, default 0
83-
Rows at the end to skip (0-indexed)
81+
names : array-like, default None
82+
List of column names to use. If file contains no header row,
83+
then you should explicitly pass header=None
8484
index_col : int, list of ints, default None
8585
Column (0-indexed) to use as the row labels of the DataFrame.
8686
Pass None if there is no such column. If a list is passed,
8787
those columns will be combined into a ``MultiIndex``. If a
8888
subset of data is selected with ``usecols``, index_col
8989
is based on the subset.
90-
names : array-like, default None
91-
List of column names to use. If file contains no header row,
92-
then you should explicitly pass header=None
93-
converters : dict, default None
94-
Dict of functions for converting values in certain columns. Keys can
95-
either be integers or column labels, values are functions that take one
96-
input argument, the Excel cell content, and return the transformed
97-
content.
90+
parse_cols : int or list, default None
91+
92+
.. deprecated:: 0.21.0
93+
Pass in `usecols` instead.
94+
95+
usecols : int, str, or list, default None
96+
* If None then parse all columns,
97+
* If int then indicates last column to be parsed
98+
* If list of ints then indicates list of column numbers to be parsed
99+
* If string then indicates comma separated list of Excel column letters and
100+
column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
101+
both sides.
102+
squeeze : boolean, default False
103+
If the parsed data only contains one column then return a Series
98104
dtype : Type name or dict of column -> type, default None
99105
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
100106
Use `object` to preserve data as stored in Excel and not interpret dtype.
@@ -103,6 +109,14 @@
103109
104110
.. versionadded:: 0.20.0
105111
112+
engine : string, default None
113+
If io is not a buffer or path, this must be set to identify io.
114+
Acceptable values are None or xlrd
115+
converters : dict, default None
116+
Dict of functions for converting values in certain columns. Keys can
117+
either be integers or column labels, values are functions that take one
118+
input argument, the Excel cell content, and return the transformed
119+
content.
106120
true_values : list, default None
107121
Values to consider as True
108122
@@ -113,36 +127,29 @@
113127
114128
.. versionadded:: 0.19.0
115129
116-
parse_cols : int or list, default None
117-
.. deprecated:: 0.21.0
118-
Pass in `usecols` instead.
130+
skiprows : list-like
131+
Rows to skip at the beginning (0-indexed)
132+
nrows : int, default None
133+
Number of rows to parse
134+
135+
.. versionadded:: 0.22.0
119136
120-
usecols : int or list, default None
121-
* If None then parse all columns,
122-
* If int then indicates last column to be parsed
123-
* If list of ints then indicates list of column numbers to be parsed
124-
* If string then indicates comma separated list of Excel column letters and
125-
column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
126-
both sides.
127-
squeeze : boolean, default False
128-
If the parsed data only contains one column then return a Series
129137
na_values : scalar, str, list-like, or dict, default None
130138
Additional strings to recognize as NA/NaN. If dict passed, specific
131139
per-column NA values. By default the following values are interpreted
132140
as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'.
133-
thousands : str, default None
134-
Thousands separator for parsing string columns to numeric. Note that
135-
this parameter is only necessary for columns stored as TEXT in Excel,
136-
any numeric columns will automatically be parsed, regardless of display
137-
format.
138141
keep_default_na : bool, default True
139142
If na_values are specified and keep_default_na is False the default NaN
140143
values are overridden, otherwise they're appended to.
141144
verbose : boolean, default False
142145
Indicate number of NA values placed in non-numeric columns
143-
engine: string, default None
144-
If io is not a buffer or path, this must be set to identify io.
145-
Acceptable values are None or xlrd
146+
thousands : str, default None
147+
Thousands separator for parsing string columns to numeric. Note that
148+
this parameter is only necessary for columns stored as TEXT in Excel,
149+
any numeric columns will automatically be parsed, regardless of display
150+
format.
151+
skip_footer : int, default 0
152+
Rows at the end to skip (0-indexed)
146153
convert_float : boolean, default True
147154
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
148155
data will be read in as floats: Excel stores all numbers as floats
@@ -193,12 +200,27 @@ def get_writer(engine_name):
193200

194201
@Appender(_read_excel_doc)
195202
@deprecate_kwarg("parse_cols", "usecols")
196-
def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
197-
index_col=None, names=None, usecols=None, parse_dates=False,
198-
date_parser=None, na_values=None, thousands=None,
199-
convert_float=True, converters=None, dtype=None,
200-
true_values=None, false_values=None, engine=None,
201-
squeeze=False, **kwds):
203+
def read_excel(io,
204+
sheet_name=0,
205+
header=0,
206+
names=None,
207+
index_col=None,
208+
usecols=None,
209+
squeeze=False,
210+
dtype=None,
211+
engine=None,
212+
converters=None,
213+
true_values=None,
214+
false_values=None,
215+
skiprows=None,
216+
nrows=None,
217+
na_values=None,
218+
parse_dates=False,
219+
date_parser=None,
220+
thousands=None,
221+
skip_footer=0,
222+
convert_float=True,
223+
**kwds):
202224

203225
# Can't use _deprecate_kwarg since sheetname=None has a special meaning
204226
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
@@ -213,12 +235,25 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
213235
io = ExcelFile(io, engine=engine)
214236

215237
return io._parse_excel(
216-
sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
217-
index_col=index_col, usecols=usecols, parse_dates=parse_dates,
218-
date_parser=date_parser, na_values=na_values, thousands=thousands,
219-
convert_float=convert_float, skip_footer=skip_footer,
220-
converters=converters, dtype=dtype, true_values=true_values,
221-
false_values=false_values, squeeze=squeeze, **kwds)
238+
sheetname=sheet_name,
239+
header=header,
240+
names=names,
241+
index_col=index_col,
242+
usecols=usecols,
243+
squeeze=squeeze,
244+
dtype=dtype,
245+
converters=converters,
246+
true_values=true_values,
247+
false_values=false_values,
248+
skiprows=skiprows,
249+
nrows=nrows,
250+
na_values=na_values,
251+
parse_dates=parse_dates,
252+
date_parser=date_parser,
253+
thousands=thousands,
254+
skip_footer=skip_footer,
255+
convert_float=convert_float,
256+
**kwds)
222257

223258

224259
class ExcelFile(object):
@@ -282,31 +317,49 @@ def __init__(self, io, **kwds):
282317
def __fspath__(self):
283318
return self._io
284319

285-
def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0,
286-
names=None, index_col=None, usecols=None, parse_dates=False,
287-
date_parser=None, na_values=None, thousands=None,
288-
convert_float=True, converters=None, true_values=None,
289-
false_values=None, squeeze=False, **kwds):
320+
def parse(self,
321+
sheet_name=0,
322+
header=0,
323+
names=None,
324+
index_col=None,
325+
usecols=None,
326+
squeeze=False,
327+
converters=None,
328+
true_values=None,
329+
false_values=None,
330+
skiprows=None,
331+
nrows=None,
332+
na_values=None,
333+
parse_dates=False,
334+
date_parser=None,
335+
thousands=None,
336+
skip_footer=0,
337+
convert_float=True,
338+
**kwds):
290339
"""
291340
Parse specified sheet(s) into a DataFrame
292341
293342
Equivalent to read_excel(ExcelFile, ...) See the read_excel
294343
docstring for more info on accepted parameters
295344
"""
296345

297-
return self._parse_excel(sheetname=sheet_name, header=header,
298-
skiprows=skiprows, names=names,
346+
return self._parse_excel(sheetname=sheet_name,
347+
header=header,
348+
names=names,
299349
index_col=index_col,
300350
usecols=usecols,
351+
squeeze=squeeze,
352+
converters=converters,
353+
true_values=true_values,
354+
false_values=false_values,
355+
skiprows=skiprows,
356+
nrows=nrows,
357+
na_values=na_values,
301358
parse_dates=parse_dates,
302-
date_parser=date_parser, na_values=na_values,
359+
date_parser=date_parser,
303360
thousands=thousands,
304361
skip_footer=skip_footer,
305362
convert_float=convert_float,
306-
converters=converters,
307-
true_values=true_values,
308-
false_values=false_values,
309-
squeeze=squeeze,
310363
**kwds)
311364

312365
def _should_parse(self, i, usecols):
@@ -342,12 +395,26 @@ def _excel2num(x):
342395
else:
343396
return i in usecols
344397

345-
def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
346-
skip_footer=0, index_col=None, usecols=None,
347-
parse_dates=False, date_parser=None, na_values=None,
348-
thousands=None, convert_float=True, true_values=None,
349-
false_values=None, verbose=False, dtype=None,
350-
squeeze=False, **kwds):
398+
def _parse_excel(self,
399+
sheetname=0,
400+
header=0,
401+
names=None,
402+
index_col=None,
403+
usecols=None,
404+
squeeze=False,
405+
dtype=None,
406+
true_values=None,
407+
false_values=None,
408+
skiprows=None,
409+
nrows=None,
410+
na_values=None,
411+
verbose=False,
412+
parse_dates=False,
413+
date_parser=None,
414+
thousands=None,
415+
skip_footer=0,
416+
convert_float=True,
417+
**kwds):
351418

352419
skipfooter = kwds.pop('skipfooter', None)
353420
if skipfooter is not None:
@@ -509,21 +576,24 @@ def _parse_cell(cell_contents, cell_typ):
509576

510577
# GH 12292 : error when read one empty column from excel file
511578
try:
512-
parser = TextParser(data, header=header, index_col=index_col,
579+
parser = TextParser(data,
580+
header=header,
581+
index_col=index_col,
513582
has_index_names=has_index_names,
514-
na_values=na_values,
515-
thousands=thousands,
516-
parse_dates=parse_dates,
517-
date_parser=date_parser,
583+
squeeze=squeeze,
584+
dtype=dtype,
518585
true_values=true_values,
519586
false_values=false_values,
520587
skiprows=skiprows,
588+
nrows=nrows,
589+
na_values=na_values,
590+
parse_dates=parse_dates,
591+
date_parser=date_parser,
592+
thousands=thousands,
521593
skipfooter=skip_footer,
522-
squeeze=squeeze,
523-
dtype=dtype,
524594
**kwds)
525595

526-
output[asheetname] = parser.read()
596+
output[asheetname] = parser.read(nrows=nrows)
527597
if names is not None:
528598
output[asheetname].columns = names
529599
if not squeeze or isinstance(output[asheetname], DataFrame):

pandas/io/parsers.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ def _read(filepath_or_buffer, kwds):
440440
# Extract some of the arguments (pass chunksize on).
441441
iterator = kwds.get('iterator', False)
442442
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
443-
nrows = _validate_integer('nrows', kwds.get('nrows', None))
443+
nrows = kwds.get('nrows', None)
444444

445445
# Check for duplicates in names.
446446
_validate_names(kwds.get("names", None))
@@ -1062,6 +1062,8 @@ def _failover_to_python(self):
10621062
raise AbstractMethodError(self)
10631063

10641064
def read(self, nrows=None):
1065+
nrows = _validate_integer('nrows', nrows)
1066+
10651067
if nrows is not None:
10661068
if self.options.get('skipfooter'):
10671069
raise ValueError('skipfooter not supported for iteration')

pandas/tests/io/test_excel.py

+27
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,33 @@ def test_read_excel_skiprows_list(self):
10171017
'skiprows_list', skiprows=np.array([0, 2]))
10181018
tm.assert_frame_equal(actual, expected)
10191019

1020+
def test_read_excel_nrows(self):
1021+
# GH 16645
1022+
num_rows_to_pull = 5
1023+
actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
1024+
nrows=num_rows_to_pull)
1025+
expected = pd.read_excel(os.path.join(self.dirpath,
1026+
'test1' + self.ext))
1027+
expected = expected[:num_rows_to_pull]
1028+
tm.assert_frame_equal(actual, expected)
1029+
1030+
def test_read_excel_nrows_greater_than_nrows_in_file(self):
1031+
# GH 16645
1032+
expected = pd.read_excel(os.path.join(self.dirpath,
1033+
'test1' + self.ext))
1034+
num_records_in_file = len(expected)
1035+
num_rows_to_pull = num_records_in_file + 10
1036+
actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
1037+
nrows=num_rows_to_pull)
1038+
tm.assert_frame_equal(actual, expected)
1039+
1040+
def test_read_excel_nrows_non_integer_parameter(self):
1041+
# GH 16645
1042+
msg = "'nrows' must be an integer >=0"
1043+
with tm.assert_raises_regex(ValueError, msg):
1044+
pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
1045+
nrows='5')
1046+
10201047
def test_read_excel_squeeze(self):
10211048
# GH 12157
10221049
f = os.path.join(self.dirpath, 'test_squeeze' + self.ext)

0 commit comments

Comments
 (0)