diff --git a/.gitignore b/.gitignore index ff0a6aef47163..6ce10492a7d55 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ .ipynb_checkpoints .tags .cache/ +.vscode/ # Compiled source # ################### diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3dd8bb2ac2de5..bf6a0945e95b2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -38,6 +38,7 @@ Other Enhancements - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func: `Series.cip()` have gained an inplace argument. (:issue: `15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`) +- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) .. _whatsnew_0210.api_breaking: @@ -51,6 +52,7 @@ Backwards incompatible API changes raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) +- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`) .. _whatsnew_0210.api: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index a4d2fabf76a41..fbf84c35d952e 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -78,22 +78,15 @@ Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will be combined into a ``MultiIndex`` -skiprows : list-like - Rows to skip at the beginning (0-indexed) -skip_footer : int, default 0 - Rows at the end to skip (0-indexed) +names : array-like, default None + List of column names to use. 
If file contains no header row, + then you should explicitly pass header=None index_col : int, list of ints, default None Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex`` -names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None -converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. +squeeze : boolean, default False + If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use `str` or `object` to preserve and not interpret dtype. @@ -102,6 +95,14 @@ .. versionadded:: 0.20.0 +engine : string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. true_values : list, default None Values to consider as True @@ -112,35 +113,64 @@ .. versionadded:: 0.19.0 +skiprows : list-like + Rows to skip at the beginning (0-indexed) +nrows : int, default None + Number of rows to parse + + .. versionadded:: 0.21.0 + +na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'. 
parse_cols : int or list, default None * If None then parse all columns, * If int then indicates last column to be parsed * If list of ints then indicates list of column numbers to be parsed * If string then indicates comma separated list of column names and column ranges (e.g. "A:E" or "A,C,E:F") -squeeze : boolean, default False - If the parsed data only contains one column then return a Series -na_values : scalar, str, list-like, or dict, default None - Additional strings to recognize as NA/NaN. If dict passed, specific - per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'. +parse_dates : boolean or list of ints or names or list of lists or dict, \ +default False + + * boolean. If True -> try parsing the index. + * list of ints or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result + 'foo' + + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel`` + + Note: A fast-path exists for iso8601-formatted dates. +date_parser : function, default None + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. 
Pandas will try to call date_parser in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by parse_dates into a single array + and pass that; and 3) call date_parser once for each row using one or more + strings (corresponding to the columns defined by parse_dates) as arguments. thousands : str, default None Thousands separator for parsing string columns to numeric. Note that this parameter is only necessary for columns stored as TEXT in Excel, any numeric columns will automatically be parsed, regardless of display format. +skip_footer : int, default 0 + Rows at the end to skip (0-indexed) +convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. verbose : boolean, default False Indicate number of NA values placed in non-numeric columns -engine: string, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd -convert_float : boolean, default True - convert integral floats to int (i.e., 1.0 --> 1). 
If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally Returns ------- @@ -191,12 +221,12 @@ def get_writer(engine_name): @Appender(_read_excel_doc) -def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, - index_col=None, names=None, parse_cols=None, parse_dates=False, - date_parser=None, na_values=None, thousands=None, - convert_float=True, converters=None, dtype=None, - true_values=None, false_values=None, engine=None, - squeeze=False, **kwds): +def read_excel(io, sheet_name=0, header=0, names=None, index_col=None, + squeeze=False, dtype=None, engine=None, converters=None, + true_values=None, false_values=None, skiprows=None, nrows=None, + na_values=None, parse_cols=None, parse_dates=False, + date_parser=None, thousands=None, skip_footer=0, + convert_float=True, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: @@ -211,12 +241,12 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, io = ExcelFile(io, engine=engine) return io._parse_excel( - sheetname=sheet_name, header=header, skiprows=skiprows, names=names, - index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, - date_parser=date_parser, na_values=na_values, thousands=thousands, - convert_float=convert_float, skip_footer=skip_footer, - converters=converters, dtype=dtype, true_values=true_values, - false_values=false_values, squeeze=squeeze, **kwds) + sheetname=sheet_name, header=header, names=names, index_col=index_col, + squeeze=squeeze, dtype=dtype, converters=converters, + true_values=true_values, false_values=false_values, skiprows=skiprows, + nrows=nrows, na_values=na_values, parse_cols=parse_cols, + parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, + skip_footer=skip_footer, convert_float=convert_float, **kwds) class ExcelFile(object): @@ -275,11 +305,11 @@ def __init__(self, io, 
**kwds): def __fspath__(self): return self._io - def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, - names=None, index_col=None, parse_cols=None, parse_dates=False, - date_parser=None, na_values=None, thousands=None, - convert_float=True, converters=None, true_values=None, - false_values=None, squeeze=False, **kwds): + def parse(self, sheet_name=0, header=0, names=None, index_col=None, + squeeze=False, converters=None, true_values=None, + false_values=None, skiprows=None, nrows=None, na_values=None, + parse_cols=None, parse_dates=False, date_parser=None, + thousands=None, skip_footer=0, convert_float=True, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -287,19 +317,23 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, docstring for more info on accepted parameters """ - return self._parse_excel(sheetname=sheet_name, header=header, - skiprows=skiprows, names=names, + return self._parse_excel(sheetname=sheet_name, + header=header, + names=names, index_col=index_col, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, parse_cols=parse_cols, parse_dates=parse_dates, - date_parser=date_parser, na_values=na_values, + date_parser=date_parser, thousands=thousands, skip_footer=skip_footer, convert_float=convert_float, - converters=converters, - true_values=true_values, - false_values=false_values, - squeeze=squeeze, **kwds) def _should_parse(self, i, parse_cols): @@ -335,12 +369,12 @@ def _excel2num(x): else: return i in parse_cols - def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, - skip_footer=0, index_col=None, parse_cols=None, - parse_dates=False, date_parser=None, na_values=None, - thousands=None, convert_float=True, true_values=None, - false_values=None, verbose=False, dtype=None, - squeeze=False, **kwds): + def _parse_excel(self, sheetname=0, header=0, names=None, index_col=None, +
squeeze=False, dtype=None, true_values=None, + false_values=None, skiprows=None, nrows=None, + na_values=None, parse_cols=None, parse_dates=False, + date_parser=None, thousands=None, skip_footer=0, + convert_float=True, verbose=False, **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -502,21 +536,24 @@ def _parse_cell(cell_contents, cell_typ): # GH 12292 : error when read one empty column from excel file try: - parser = TextParser(data, header=header, index_col=index_col, + parser = TextParser(data, + header=header, + index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, - true_values=true_values, - false_values=false_values, skiprows=skiprows, + nrows=nrows, skipfooter=skip_footer, squeeze=squeeze, + true_values=true_values, + false_values=false_values, dtype=dtype, **kwds) - output[asheetname] = parser.read() + output[asheetname] = parser.read(nrows=nrows) if names is not None: output[asheetname].columns = names if not squeeze or isinstance(output[asheetname], DataFrame): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c2d5a629b03a3..2ba263b9b4cd4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -399,7 +399,7 @@ def _read(filepath_or_buffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get('iterator', False) chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) - nrows = _validate_integer('nrows', kwds.get('nrows', None)) + nrows = kwds.get('nrows', None) # Create the parser. 
parser = TextFileReader(filepath_or_buffer, **kwds) @@ -998,6 +998,8 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): + nrows = _validate_integer('nrows', nrows) + if nrows is not None: if self.options.get('skipfooter'): raise ValueError('skipfooter not supported for iteration') @@ -1893,6 +1895,8 @@ def TextParser(*args, **kwds): date_parser : function, default None skiprows : list of integers Row numbers to skip + nrows : int, default None + Number of rows to parse skipfooter : int Number of line at bottom of file to skip converters : dict, default None diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index abe3757ec64f3..b1fb8568c40cd 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1000,6 +1000,33 @@ def test_read_excel_skiprows_list(self): 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) + def test_read_excel_nrows(self): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows=num_rows_to_pull) + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + self.ext)) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self): + # GH 16645 + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + self.ext)) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with tm.assert_raises_regex(ValueError, msg): + pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + nrows='5') + def test_read_excel_squeeze(self): # GH 12157 f = os.path.join(self.dirpath, 'test_squeeze' + self.ext)