diff --git a/doc/source/io.rst b/doc/source/io.rst index 8fe5685b33aff..0aa4ea72e3b13 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2800,21 +2800,21 @@ Parsing Specific Columns It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. `read_excel` takes -a `parse_cols` keyword to allow you to specify a subset of columns to parse. +a `usecols` keyword to allow you to specify a subset of columns to parse. -If `parse_cols` is an integer, then it is assumed to indicate the last column +If `usecols` is an integer, then it is assumed to indicate the last column to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', parse_cols=2) + read_excel('path_to_file.xls', 'Sheet1', usecols=2) -If `parse_cols` is a list of integers, then it is assumed to be the file column +If `usecols` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) + read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) Parsing Dates diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 61c05d1b226e0..52b8437ec98b1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -658,6 +658,7 @@ Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`). - The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`) - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). 
- :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 41e3b5283a532..c8d0e42a022ba 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -31,7 +31,7 @@ import pandas.compat.openpyxl_compat as openpyxl_compat from warnings import warn from distutils.version import LooseVersion -from pandas.util._decorators import Appender +from pandas.util._decorators import Appender, deprecate_kwarg from textwrap import fill __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] @@ -86,7 +86,7 @@ Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``parse_cols``, index_col + subset of data is selected with ``usecols``, index_col is based on the subset. names : array-like, default None List of column names to use. If file contains no header row, @@ -115,6 +115,10 @@ .. versionadded:: 0.19.0 parse_cols : int or list, default None + .. deprecated:: 0.21.0 + Pass in `usecols` instead. 
+ +usecols : int or list, default None * If None then parse all columns, * If int then indicates last column to be parsed * If list of ints then indicates list of column numbers to be parsed @@ -205,8 +209,9 @@ def get_writer(engine_name): @Appender(_read_excel_doc) +@deprecate_kwarg("parse_cols", "usecols") def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, - index_col=None, names=None, parse_cols=None, parse_dates=False, + index_col=None, names=None, usecols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, converters=None, dtype=None, true_values=None, false_values=None, engine=None, @@ -226,7 +231,7 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, return io._parse_excel( sheetname=sheet_name, header=header, skiprows=skiprows, names=names, - index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, + index_col=index_col, usecols=usecols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, convert_float=convert_float, skip_footer=skip_footer, converters=converters, dtype=dtype, true_values=true_values, @@ -295,7 +300,7 @@ def __fspath__(self): return self._io def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, - names=None, index_col=None, parse_cols=None, parse_dates=False, + names=None, index_col=None, usecols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, converters=None, true_values=None, false_values=None, squeeze=False, **kwds): @@ -309,7 +314,7 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, return self._parse_excel(sheetname=sheet_name, header=header, skiprows=skiprows, names=names, index_col=index_col, - parse_cols=parse_cols, + usecols=usecols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, @@ -321,7 +326,7 @@ def parse(self, sheet_name=0, header=0, skiprows=None, 
skip_footer=0, squeeze=squeeze, **kwds) - def _should_parse(self, i, parse_cols): + def _should_parse(self, i, usecols): def _range2cols(areas): """ @@ -347,15 +352,15 @@ def _excel2num(x): cols.append(_excel2num(rng)) return cols - if isinstance(parse_cols, int): - return i <= parse_cols - elif isinstance(parse_cols, compat.string_types): - return i in _range2cols(parse_cols) + if isinstance(usecols, int): + return i <= usecols + elif isinstance(usecols, compat.string_types): + return i in _range2cols(usecols) else: - return i in parse_cols + return i in usecols def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, - skip_footer=0, index_col=None, parse_cols=None, + skip_footer=0, index_col=None, usecols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, true_values=None, false_values=None, verbose=False, dtype=None, @@ -470,10 +475,10 @@ def _parse_cell(cell_contents, cell_typ): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): - if parse_cols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, parse_cols) + if usecols is not None and j not in should_parse: + should_parse[j] = self._should_parse(j, usecols) - if parse_cols is None or should_parse[j]: + if usecols is None or should_parse[j]: row.append(_parse_cell(value, typ)) data.append(row) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4e25fe0371718..f21f638799e57 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -158,56 +158,74 @@ def setup_method(self, method): self.check_skip() super(ReadingTestsBase, self).setup_method(method) - def test_parse_cols_int(self): + def test_usecols_int(self): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['A', 'B', 'C']) - df1 = self.get_exceldf('test1', 'Sheet1', index_col=0, parse_cols=3) + df1 = self.get_exceldf('test1', 'Sheet1', index_col=0, usecols=3) df2 = 
self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - parse_cols=3) + usecols=3) + + with tm.assert_produces_warning(FutureWarning): + df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + index_col=0, parse_cols=3) + # TODO add index to xls file) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df3, dfref, check_names=False) - def test_parse_cols_list(self): + def test_usecols_list(self): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['B', 'C']) df1 = self.get_exceldf('test1', 'Sheet1', index_col=0, - parse_cols=[0, 2, 3]) + usecols=[0, 2, 3]) df2 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - parse_cols=[0, 2, 3]) + usecols=[0, 2, 3]) + + with tm.assert_produces_warning(FutureWarning): + df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + index_col=0, parse_cols=[0, 2, 3]) + # TODO add index to xls file) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df3, dfref, check_names=False) - def test_parse_cols_str(self): + def test_usecols_str(self): dfref = self.get_csv_refdf('test1') df1 = dfref.reindex(columns=['A', 'B', 'C']) df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - parse_cols='A:D') + usecols='A:D') df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - parse_cols='A:D') + usecols='A:D') + + with tm.assert_produces_warning(FutureWarning): + df4 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + index_col=0, parse_cols='A:D') + # TODO add index to xls, read xls ignores index name ? 
tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df4, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - parse_cols='A,C,D') + usecols='A,C,D') df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - parse_cols='A,C,D') + usecols='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - parse_cols='A,C:D') + usecols='A,C:D') df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - parse_cols='A,C:D') + usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) @@ -457,14 +475,14 @@ def test_read_one_empty_col_no_header(self): actual_header_none = read_excel( path, 'no_header', - parse_cols=[0], + usecols=[0], header=None ) actual_header_zero = read_excel( path, 'no_header', - parse_cols=[0], + usecols=[0], header=0 ) expected = DataFrame() @@ -486,14 +504,14 @@ def test_read_one_empty_col_with_header(self): actual_header_none = read_excel( path, 'with_header', - parse_cols=[0], + usecols=[0], header=None ) actual_header_zero = read_excel( path, 'with_header', - parse_cols=[0], + usecols=[0], header=0 ) expected_header_none = DataFrame(pd.Series([0], dtype='int64'))