diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 9c39fac13b230..70cc160cb4904 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -154,9 +154,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ - -i "pandas.ExcelFile PR01,SA01" \ - -i "pandas.ExcelFile.parse PR01,SA01" \ - -i "pandas.ExcelWriter SA01" \ -i "pandas.Float32Dtype SA01" \ -i "pandas.Float64Dtype SA01" \ -i "pandas.Grouper PR02,SA01" \ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a9da95054b81a..2b35cfa044ae9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -979,6 +979,12 @@ class ExcelWriter(Generic[_WorkbookT]): .. versionadded:: 1.3.0 + See Also + -------- + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Notes ----- For compatibility with CSV writers, ExcelWriter serializes lists @@ -1434,6 +1440,7 @@ def inspect_excel_format( return "zip" +@doc(storage_options=_shared_docs["storage_options"]) class ExcelFile: """ Class for parsing tabular Excel sheets into DataFrame objects. @@ -1472,19 +1479,27 @@ class ExcelFile: - Otherwise if ``path_or_buffer`` is in xlsb format, `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used. - .. versionadded:: 1.3.0 + .. versionadded:: 1.3.0 - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - .. warning:: + .. warning:: - Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. - This is not supported, switch to using ``openpyxl`` instead. + Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. + This is not supported, switch to using ``openpyxl`` instead. 
+ {storage_options} engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. + See Also + -------- + DataFrame.to_excel : Write DataFrame to an Excel file. + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP @@ -1595,11 +1610,134 @@ def parse( Equivalent to read_excel(ExcelFile, ...) See the read_excel docstring for more info on accepted parameters. + Parameters + ---------- + sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions (chart sheets do not count as a sheet position). + Lists of strings/integers are used to request multiple sheets. + Specify ``None`` to get all worksheets. + header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. + names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. + index_col : int, str, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. + + Missing values will be forward filled to allow roundtripping with + ``to_excel`` for ``merged_cells=True``. To avoid forward filling the + missing values use ``set_index`` after reading the data instead of + ``index_col``. + usecols : str, list-like, or callable, default None + * If None, then parse all columns. 
+ * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed + (0-indexed). + * If list of string, then indicates list of column names to be parsed. + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. + true_values : list, default None + Values to consider as True. + false_values : list, default None + Values to consider as False. + skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. + nrows : int, default None + Number of rows to parse. + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. + parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and + parse as a single date column. + * ``dict``, e.g. 
{{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparsable date, the entire column or + index will be returned unaltered as an object data type. If you + don't want to parse some cells as date just change their type + in Excel to "Text". For non-standard datetime parsing, use + ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. + date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`to_datetime` as-needed. + date_format : str or dict of column -> format, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates + according to this format. For anything more complex, + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. + comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. 
+ skipfooter : int, default 0 + Rows at the end to skip (0-indexed). + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + **kwds : dict, optional + Arbitrary keyword arguments passed to excel engine. + Returns ------- DataFrame or dict of DataFrames DataFrame from the passed in Excel file. + See Also + -------- + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])