From 020e253c7f6eff26fa02877628e9ee4c5ec65e1a Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 09:55:33 -0500 Subject: [PATCH 01/17] PR09 in pandas.ExcelWriter and pandas.read_excel --- pandas/io/excel/_base.py | 484 ++++++++++++++++++++------------------- 1 file changed, 243 insertions(+), 241 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1f1ad55969d6f..a3faeb3c247a1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -34,245 +34,245 @@ _read_excel_doc = ( """ -Read an Excel file into a pandas DataFrame. - -Support both `xls` and `xlsx` file extensions from a local filesystem or URL. -Support an option to read a single sheet or a list of sheets. - -Parameters ----------- -io : str, ExcelFile, xlrd.Book, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: ``file://localhost/path/to/table.xlsx``. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) - or ``StringIO``. -sheet_name : str, int, list, or None, default 0 - Strings are used for sheet names. Integers are used in zero-indexed - sheet positions. Lists of strings/integers are used to request - multiple sheets. Specify None to get all sheets. - - Available cases: - - * Defaults to ``0``: 1st sheet as a `DataFrame` - * ``1``: 2nd sheet as a `DataFrame` - * ``"Sheet1"``: Load sheet with name "Sheet1" - * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of `DataFrame` - * None: All sheets. - -header : int, list of int, default 0 - Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex``. Use None if there is no header. -names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None. -index_col : int, list of int, default None - Column (0-indexed) to use as the row labels of the DataFrame. - Pass None if there is no such column. If a list is passed, - those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``usecols``, index_col - is based on the subset. -usecols : int, str, list-like, or callable default None - Return a subset of the columns. - - * If None, then parse all columns. - * If int, then indicates last column to be parsed. - - .. deprecated:: 0.24.0 - Pass in a list of int instead from 0 to `usecols` inclusive. - - * If str, then indicates comma separated list of Excel column letters - and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. - * If list of int, then indicates list of column numbers to be parsed. - * If list of string, then indicates list of column names to be parsed. - - .. versionadded:: 0.24.0 - - * If callable, then evaluate each column name against it and parse the - column if the callable returns ``True``. - - .. versionadded:: 0.24.0 - -squeeze : bool, default False - If the parsed data only contains one column then return a Series. -dtype : Type name or dict of column -> type, default None - Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - Use `object` to preserve data as stored in Excel and not interpret dtype. 
- If converters are specified, they will be applied INSTEAD - of dtype conversion. -engine : str, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None, "xlrd", "openpyxl" or "odf". -converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. -true_values : list, default None - Values to consider as True. -false_values : list, default None - Values to consider as False. -skiprows : list-like - Rows to skip at the beginning (0-indexed). -nrows : int, default None - Number of rows to parse. - - .. versionadded:: 0.23.0 - -na_values : scalar, str, list-like, or dict, default None - Additional strings to recognize as NA/NaN. If dict passed, specific - per-column NA values. By default the following values are interpreted - as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") - + """'. -keep_default_na : bool, default True - Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: - - * If `keep_default_na` is True, and `na_values` are specified, `na_values` - is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only - the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is False, and `na_values` are not specified, no - strings will be parsed as NaN. - - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. -verbose : bool, default False - Indicate number of NA values placed in non-numeric columns. -parse_dates : bool, list-like, or dict, default False - The behavior is as follows: - - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 - each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as - a single date column. - * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call - result 'foo' - - If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. - - Note: A fast-path exists for iso8601-formatted dates. -date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. 
-thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. -comment : str, default None - Comments out remainder of line. Pass a character or characters to this - argument to indicate comments in the input file. Any data between the - comment string and the end of the current line is ignored. -skip_footer : int, default 0 - Alias of `skipfooter`. - - .. deprecated:: 0.23.0 - Use `skipfooter` instead. -skipfooter : int, default 0 - Rows at the end to skip (0-indexed). -convert_float : bool, default True - Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally. -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. -**kwds : optional + Read an Excel file into a pandas DataFrame. + + Support both `xls` and `xlsx` file extensions from a local filesystem or URL. + Support an option to read a single sheet or a list of sheets. + + Parameters + ---------- + io : str, ExcelFile, xlrd.Book, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.xlsx``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions. Lists of strings/integers are used to request + multiple sheets. Specify None to get all sheets. + + Available cases: + + * Defaults to ``0``: 1st sheet as a `DataFrame` + * ``1``: 2nd sheet as a `DataFrame` + * ``"Sheet1"``: Load sheet with name "Sheet1" + * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" + as a dict of `DataFrame` + * None: All sheets. + + header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. + names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. + index_col : int, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. + usecols : int, str, list-like, or callable default None + Return a subset of the columns. + + * If None, then parse all columns. + * If int, then indicates last column to be parsed. + + .. deprecated:: 0.24.0 + Pass in a list of int instead from 0 to `usecols` inclusive. + + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. 
+ * If list of int, then indicates list of column numbers to be parsed. + * If list of string, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + .. versionadded:: 0.24.0 + + squeeze : bool, default False + If the parsed data only contains one column then return a Series. + dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use `object` to preserve data as stored in Excel and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + engine : str, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None, "xlrd", "openpyxl" or "odf". + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. + true_values : list, default None + Values to consider as True. + false_values : list, default None + Values to consider as False. + skiprows : list-like + Rows to skip at the beginning (0-indexed). + nrows : int, default None + Number of rows to parse. + + .. versionadded:: 0.23.0 + + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + +"""'. + keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. + verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. + parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * bool. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. 
+ date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. + thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. + comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. + skip_footer : int, default 0 + Alias of `skipfooter`. + + .. deprecated:: 0.23.0 + Use `skipfooter` instead. + skipfooter : int, default 0 + Rows at the end to skip (0-indexed). + convert_float : bool, default True + Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally. + mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. + **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. -Returns -------- -DataFrame or dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheet_name - argument for more information on when a dict of DataFrames is returned. - -See Also --------- -to_excel : Write DataFrame to an Excel file. -to_csv : Write DataFrame to a comma-separated values (csv) file. -read_csv : Read a comma-separated values (csv) file into DataFrame. -read_fwf : Read a table of fixed-width formatted lines into DataFrame. - -Examples --------- -The file can be read using the file name as string or an open file object: - ->>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP - Name Value -0 string1 1 -1 string2 2 -2 #Comment 3 - ->>> pd.read_excel(open('tmp.xlsx', 'rb'), -... sheet_name='Sheet3') # doctest: +SKIP - Unnamed: 0 Name Value -0 0 string1 1 -1 1 string2 2 -2 2 #Comment 3 - -Index and header can be specified via the `index_col` and `header` arguments - ->>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP - 0 1 2 -0 NaN Name Value -1 0.0 string1 1 -2 1.0 string2 2 -3 2.0 #Comment 3 - -Column types are inferred but can be explicitly specified - ->>> pd.read_excel('tmp.xlsx', index_col=0, -... dtype={'Name': str, 'Value': float}) # doctest: +SKIP - Name Value -0 string1 1.0 -1 string2 2.0 -2 #Comment 3.0 - -True, False, and NA values, and thousands separators have defaults, -but can be explicitly specified, too. Supply the values you would like -as strings or lists of strings! - ->>> pd.read_excel('tmp.xlsx', index_col=0, -... 
na_values=['string1', 'string2']) # doctest: +SKIP - Name Value -0 NaN 1 -1 NaN 2 -2 #Comment 3 - -Comment lines in the excel input file can be skipped using the `comment` kwarg - ->>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP - Name Value -0 string1 1.0 -1 string2 2.0 -2 None NaN -""" + Returns + ------- + DataFrame or dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheet_name + argument for more information on when a dict of DataFrames is returned. + + See Also + -------- + to_excel : Write DataFrame to an Excel file. + to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + The file can be read using the file name as string or an open file object: + + >>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP + Name Value + 0 string1 1 + 1 string2 2 + 2 #Comment 3 + + >>> pd.read_excel(open('tmp.xlsx', 'rb'), + ... sheet_name='Sheet3') # doctest: +SKIP + Unnamed: 0 Name Value + 0 0 string1 1 + 1 1 string2 2 + 2 2 #Comment 3 + + Index and header can be specified via the `index_col` and `header` arguments + + >>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP + 0 1 2 + 0 NaN Name Value + 1 0.0 string1 1 + 2 1.0 string2 2 + 3 2.0 #Comment 3 + + Column types are inferred but can be explicitly specified + + >>> pd.read_excel('tmp.xlsx', index_col=0, + ... dtype={'Name': str, 'Value': float}) # doctest: +SKIP + Name Value + 0 string1 1.0 + 1 string2 2.0 + 2 #Comment 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_excel('tmp.xlsx', index_col=0, + ... na_values=['string1', 'string2']) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #Comment 3 + + Comment lines in the excel input file can be skipped using the `comment` kwarg + + >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP + Name Value + 0 string1 1.0 + 1 string2 2.0 + 2 None NaN + """ ) @@ -540,8 +540,10 @@ def parse( class ExcelWriter(metaclass=abc.ABCMeta): """ - Class for writing DataFrame objects into excel sheets, default is to use - xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage. + Class for writing DataFrame objects into excel sheets. + + Default is to use xlwt for xls, openpyxl for xlsx. + See DataFrame.to_excel for typical usage. Parameters ---------- @@ -554,8 +556,8 @@ class ExcelWriter(metaclass=abc.ABCMeta): date_format : str, default None Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None - Format string for datetime objects written into Excel files. - (e.g. 'YYYY-MM-DD HH:MM:SS') + Format string for datetime objects written into Excel files + (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' File mode to use (write or append). 
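
The two docstrings patched above describe the read/write pair. As a minimal sketch of the round trip they document — the frame contents and the 'out.xlsx' path are illustrative only, and an installed openpyxl engine is assumed:

import pandas as pd

df = pd.DataFrame({"when": pd.to_datetime(["2019-11-07"]), "value": [1.5]})

# datetime_format matches the 'YYYY-MM-DD HH:MM:SS' example in the
# ExcelWriter docstring above.
with pd.ExcelWriter("out.xlsx", datetime_format="YYYY-MM-DD HH:MM:SS") as writer:
    df.to_excel(writer, sheet_name="Sheet1")

# Read it back with the sheet_name and index_col parameters documented above.
round_trip = pd.read_excel("out.xlsx", sheet_name="Sheet1", index_col=0)
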
From 1d29e3cb5ebc097466a7bca79343b731aafd5c49 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 09:58:57 -0500 Subject: [PATCH 02/17] PR09 in read_msgpack --- pandas/io/packers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index c0ace7996e1b9..253441ab25813 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -191,7 +191,7 @@ def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): ``StringIO``. encoding : Encoding for decoding msgpack str type iterator : boolean, if True, return an iterator to the unpacker - (default is False) + (default is False). Returns ------- From 22df300bc97b8b73584ec651f89044823b59ed8a Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:02:05 -0500 Subject: [PATCH 03/17] PR09 in pandas.util.hash_pandas_object --- pandas/core/util/hashing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 011ea1b8e42f2..0a999f5c27536 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -67,11 +67,11 @@ def hash_pandas_object( Parameters ---------- index : bool, default True - include the index in the hash (if Series/DataFrame) + Include the index in the hash (if Series/DataFrame). encoding : str, default 'utf8' - encoding for data & key when strings + Encoding for data & key when strings. hash_key : str, default _default_hash_key - hash_key for string key to encode + Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. From bfcfdcb8f8f01d9cd648e05283874af1c5f7c2bf Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:03:30 -0500 Subject: [PATCH 04/17] PR09 in pandas.util.hash_array --- pandas/core/util/hashing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 0a999f5c27536..23c370638b572 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -253,9 +253,9 @@ def hash_array( ---------- vals : ndarray, Categorical encoding : str, default 'utf8' - encoding for data & key when strings + Encoding for data & key when strings. hash_key : str, default _default_hash_key - hash_key for string key to encode + Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. From ff6d420f09fb4b34aaa26370bc763a6bd80767e6 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:06:41 -0500 Subject: [PATCH 05/17] PR09 in pandas.interval_range --- pandas/core/indexes/interval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c9554016630cd..0c077702b4cb4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1407,24 +1407,24 @@ def interval_range( Parameters ---------- start : numeric or datetime-like, default None - Left bound for generating intervals + Left bound for generating intervals. end : numeric or datetime-like, default None - Right bound for generating intervals + Right bound for generating intervals. periods : int, default None - Number of periods to generate + Number of periods to generate. 
     freq : numeric, str, or DateOffset, default None
         The length of each interval. Must be consistent with the type of start
         and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1
         for numeric and 'D' for datetime-like.
     name : str, default None
-        Name of the resulting IntervalIndex
+        Name of the resulting IntervalIndex.
     closed : {'left', 'right', 'both', 'neither'}, default 'right'
         Whether the intervals are closed on the left-side, right-side, both
         or neither.
 
     Returns
     -------
-    rng : IntervalIndex
+    IntervalIndex
 
     See Also
     --------

From 007319619b9ada5a73c0ef8d906ed50e78ab337b Mon Sep 17 00:00:00 2001
From: hughkelley
Date: Thu, 7 Nov 2019 10:09:46 -0500
Subject: [PATCH 06/17] PR09 in pandas.period_range

---
 pandas/core/indexes/period.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index a0f16789621c7..ca7be9ba512da 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -997,28 +997,28 @@ def memory_usage(self, deep=False):
 
 def period_range(start=None, end=None, periods=None, freq=None, name=None):
     """
-    Return a fixed frequency PeriodIndex, with day (calendar) as the default
-    frequency.
+    Return a fixed frequency PeriodIndex.
+
+    The day (calendar) is the default frequency.
 
     Parameters
     ----------
     start : str or period-like, default None
-        Left bound for generating periods
+        Left bound for generating periods.
     end : str or period-like, default None
-        Right bound for generating periods
+        Right bound for generating periods.
     periods : int, default None
-        Number of periods to generate
+        Number of periods to generate.
     freq : str or DateOffset, optional
         Frequency alias. By default the freq is taken from `start` or `end`
         if those are Period objects. Otherwise, the default is ``"D"`` for
         daily frequency.
-
     name : str, default None
-        Name of the resulting PeriodIndex
+        Name of the resulting PeriodIndex.
 
     Returns
     -------
-    prng : PeriodIndex
+    PeriodIndex
 
     Notes
     -----

From 3763e462af8ab0c2f2361ec19acdb2193e73da5e Mon Sep 17 00:00:00 2001
From: hughkelley
Date: Thu, 7 Nov 2019 10:31:05 -0500
Subject: [PATCH 07/17] PR09 in pandas.to_datetime

---
 pandas/core/tools/datetimes.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 70143e4603a4b..7eaa61dd9de29 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -577,14 +577,13 @@ def to_datetime(
 
     Parameters
     ----------
-    arg : int, float, str, datetime, list, tuple, 1-d array, Series
-        or DataFrame/dict-like
-
+    arg : int, float, str, datetime, list, tuple, 1-d array, Series or DataFrame/dict-like
+        The object to convert to a datetime.
     errors : {'ignore', 'raise', 'coerce'}, default 'raise'
-
-        - If 'raise', then invalid parsing will raise an exception
-        - If 'coerce', then invalid parsing will be set as NaT
-        - If 'ignore', then invalid parsing will return the input
+        Behaves as:
+        - If 'raise', then invalid parsing will raise an exception.
+        - If 'coerce', then invalid parsing will be set as NaT.
+        - If 'ignore', then invalid parsing will return the input.
     dayfirst : bool, default False
         Specify a date parse order if `arg` is str or its list-likes.
         If True, parses dates with the day first, eg 10/11/12 is parsed as
@@ -605,7 +604,7 @@ def to_datetime(
         Return UTC DatetimeIndex if True (converting any tz-aware
         datetime.datetime objects as well).
box : bool, default True - + Behaves as: - If True returns a DatetimeIndex or Index-like object - If False returns ndarray of values. @@ -615,17 +614,17 @@ def to_datetime( respectively. format : str, default None - strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. See strftime documentation for more information on choices: - https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. exact : bool, True by default - + Behaves as: - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : str, default 'ns' - unit of the arg (D,s,ms,us,ns) denote the unit, which is an + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. @@ -652,11 +651,12 @@ def to_datetime( .. versionadded:: 0.23.0 .. versionchanged:: 0.25.0 - - changed default value from False to True + - changed default value from False to True. Returns ------- - ret : datetime if parsing succeeded. + datetime + If parsing succeeded. Return type depends on input: - list-like: DatetimeIndex @@ -712,10 +712,10 @@ def to_datetime( 4 3/12/2000 dtype: object - >>> %timeit pd.to_datetime(s,infer_datetime_format=True) # doctest: +SKIP + >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP 100 loops, best of 3: 10.4 ms per loop - >>> %timeit pd.to_datetime(s,infer_datetime_format=False) # doctest: +SKIP + >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP 1 loop, best of 3: 471 ms per loop Using a unix epoch time From fdfbdf0093147b2d3f9930be4d3518ec8f65277c Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:36:40 -0500 Subject: [PATCH 08/17] PR09 in pandas.wide_to_long --- pandas/core/reshape/melt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 98fee491e0a73..f7d9462d2ec32 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -206,12 +206,12 @@ def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): Parameters ---------- df : DataFrame - The wide-format DataFrame + The wide-format DataFrame. stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. i : str or list-like - Column(s) to use as id variable(s) + Column(s) to use as id variable(s). j : str The name of the sub-observation variable. What you wish to name your suffix in the long format. @@ -219,14 +219,14 @@ def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you - can strip the hyphen by specifying `sep='-'` + can strip the hyphen by specifying `sep='-'`. suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. 
You can also further disambiguate suffixes, for example, if your wide variables are of the form A-one, B-two,.., and you have an unrelated column A-rating, you can - ignore the last one by specifying `suffix='(!?one|two)'` + ignore the last one by specifying `suffix='(!?one|two)'`. .. versionchanged:: 0.23.0 When all suffixes are numeric, they are cast to int64/float64. @@ -360,7 +360,7 @@ def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): >>> stubnames = sorted( ... set([match[0] for match in df.columns.str.findall( - ... r'[A-B]\(.*\)').values if match != [] ]) + ... r'[A-B]\(.*\)').values if match != []]) ... ) >>> list(stubnames) ['A(weekly)', 'B(weekly)'] From e6935f2040effe6512e3607f0d5fad396057b73a Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:46:56 -0500 Subject: [PATCH 09/17] PR09 in pandas.merge_ordered --- pandas/core/reshape/merge.py | 42 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a189b2cd1ab84..67ecc5e801321 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -176,9 +176,10 @@ def merge_ordered( how="outer", ): """ - Perform merge with optional filling/interpolation designed for ordered - data like time series data. Optionally perform group-wise merge (see - examples). + Perform merge with optional filling/interpolation. + + Designed for ordered data like time series data. Optionally + perform group-wise merge (see examples). Parameters ---------- @@ -189,18 +190,18 @@ def merge_ordered( left_on : label or list, or array-like Field names to join on in left DataFrame. Can be a vector or list of vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns + the join key instead of columns. right_on : label or list, or array-like Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + left_on docs. left_by : column name or list of column names Group left DataFrame by group columns and merge piece by piece with - right DataFrame + right DataFrame. right_by : column name or list of column names Group right DataFrame by group columns and merge piece by piece with - left DataFrame + left DataFrame. fill_method : {'ffill', None}, default None - Interpolation method for data + Interpolation method for data. suffixes : Sequence, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in @@ -211,16 +212,17 @@ def merge_ordered( .. versionchanged:: 0.25.0 how : {'left', 'right', 'outer', 'inner'}, default 'outer' + Behavior: * left: use only keys from left frame (SQL: left outer join) * right: use only keys from right frame (SQL: right outer join) * outer: use union of keys from both frames (SQL: full outer join) - * inner: use intersection of keys from both frames (SQL: inner join) + * inner: use intersection of keys from both frames (SQL: inner join). Returns ------- - merged : DataFrame - The output type will the be same as 'left', if it is a subclass - of DataFrame. + DataFrame + The merged DataFrame The output type will the be same as + 'left', if it is a subclass of DataFrame. 
See Also -------- @@ -229,15 +231,21 @@ def merge_ordered( Examples -------- - >>> A >>> B - key lvalue group key rvalue - 0 a 1 a 0 b 1 - 1 c 2 a 1 c 2 - 2 e 3 a 2 d 3 + >>> A + key lvalue group + 0 a 1 a + 1 c 2 a + 2 e 3 a 3 a 1 b 4 c 2 b 5 e 3 b + >>> B + Key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + >>> merge_ordered(A, B, fill_method='ffill', left_by='group') group key lvalue rvalue 0 a a 1 NaN From 83c39706c97d493b37e0a1724a8530ccd62380f6 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 10:55:45 -0500 Subject: [PATCH 10/17] PR09 in pandas.pivot_table --- pandas/core/frame.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 40efc4c65476a..301cfa53e3e0b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5881,13 +5881,13 @@ def pivot(self, index=None, columns=None, values=None): hierarchical columns whose top level are the function names (inferred from the function objects themselves) If dict is passed, the key is column to aggregate and value - is function or list of functions + is function or list of functions. fill_value : scalar, default None - Value to replace missing values with + Value to replace missing values with. margins : bool, default False - Add all row / columns (e.g. for subtotal / grand totals) + Add all row / columns (e.g. for subtotal / grand totals). dropna : bool, default True - Do not include columns whose entries are all NaN + Do not include columns whose entries are all NaN. margins_name : str, default 'All' Name of the row / column that will contain the totals when margins is True. @@ -5901,6 +5901,7 @@ def pivot(self, index=None, columns=None, values=None): Returns ------- DataFrame + An Excel style pivot table. See Also -------- From a1bd2089bae6c7d20db8e4fc006d5ef3c95286c2 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 11:03:09 -0500 Subject: [PATCH 11/17] PEP8 formatting --- pandas/core/reshape/merge.py | 4 ++-- pandas/core/tools/datetimes.py | 5 +++++ pandas/io/excel/_base.py | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 67ecc5e801321..039dd99976cc2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -178,7 +178,7 @@ def merge_ordered( """ Perform merge with optional filling/interpolation. - Designed for ordered data like time series data. Optionally + Designed for ordered data like time series data. Optionally perform group-wise merge (see examples). Parameters @@ -221,7 +221,7 @@ def merge_ordered( Returns ------- DataFrame - The merged DataFrame The output type will the be same as + The merged DataFrame The output type will the be same as 'left', if it is a subclass of DataFrame. See Also diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7eaa61dd9de29..bc3d192d47d2d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,6 +2,11 @@ from datetime import datetime, time from functools import partial from typing import Optional, TypeVar, Union +- [ ] closes #xxxx +- [ ] tests added / passed +- [ ] passes `black pandas` +- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` +- [ ] whatsnew entry import numpy as np diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a3faeb3c247a1..f76207f47c2fc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -130,8 +130,8 @@ Additional strings to recognize as NA/NaN. 
If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") - +"""'. + + fill("', '".join(sorted(_NA_VALUES)), 70, + subsequent_indent=" ") + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. Depending on whether `na_values` is passed in, the behavior is as follows: From 8888d535f3876b799d83f6dea7db7c359fe21d42 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 11:04:59 -0500 Subject: [PATCH 12/17] undo paste --- pandas/core/tools/datetimes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bc3d192d47d2d..7eaa61dd9de29 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,11 +2,6 @@ from datetime import datetime, time from functools import partial from typing import Optional, TypeVar, Union -- [ ] closes #xxxx -- [ ] tests added / passed -- [ ] passes `black pandas` -- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` -- [ ] whatsnew entry import numpy as np From 25b48c29560b8f488bc790c03cd03490f53680f0 Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 11:10:09 -0500 Subject: [PATCH 13/17] formatting of linebreaks --- pandas/core/tools/datetimes.py | 2 +- pandas/io/excel/_base.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7eaa61dd9de29..5b4fcff4c3653 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -655,7 +655,7 @@ def to_datetime( Returns ------- - datetime + datetime If parsing succeeded. Return type depends on input: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f76207f47c2fc..0a0df5806b47b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -130,8 +130,7 @@ Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, - subsequent_indent=" ") + """'. + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. Depending on whether `na_values` is passed in, the behavior is as follows: From 6cf6bf30a9eb5bb314cca8bdddaf7ae88b562b9e Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 11:14:01 -0500 Subject: [PATCH 14/17] add linebreak --- pandas/io/excel/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0a0df5806b47b..b3279f7260d03 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -130,7 +130,8 @@ Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. + + fill("', '".join(sorted(_NA_VALUES)), 70, + subsequent_indent=" ") + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. 
Depending on whether `na_values` is passed in, the behavior is as follows: From afded0c5ed8e2bc31c10805c92ed1084c16ca45e Mon Sep 17 00:00:00 2001 From: hughkelley Date: Thu, 7 Nov 2019 12:14:40 -0500 Subject: [PATCH 15/17] Revert "PR09 in pandas.ExcelWriter and pandas.read_excel" This reverts commit 020e253c7f6eff26fa02877628e9ee4c5ec65e1a. --- pandas/io/excel/_base.py | 486 +++++++++++++++++++-------------------- 1 file changed, 243 insertions(+), 243 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b3279f7260d03..8e94d4699dc9b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -34,245 +34,247 @@ _read_excel_doc = ( """ - Read an Excel file into a pandas DataFrame. - - Support both `xls` and `xlsx` file extensions from a local filesystem or URL. - Support an option to read a single sheet or a list of sheets. - - Parameters - ---------- - io : str, ExcelFile, xlrd.Book, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: ``file://localhost/path/to/table.xlsx``. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) - or ``StringIO``. - sheet_name : str, int, list, or None, default 0 - Strings are used for sheet names. Integers are used in zero-indexed - sheet positions. Lists of strings/integers are used to request - multiple sheets. Specify None to get all sheets. - - Available cases: - - * Defaults to ``0``: 1st sheet as a `DataFrame` - * ``1``: 2nd sheet as a `DataFrame` - * ``"Sheet1"``: Load sheet with name "Sheet1" - * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of `DataFrame` - * None: All sheets. - - header : int, list of int, default 0 - Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex``. Use None if there is no header. - names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None. - index_col : int, list of int, default None - Column (0-indexed) to use as the row labels of the DataFrame. - Pass None if there is no such column. If a list is passed, - those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``usecols``, index_col - is based on the subset. - usecols : int, str, list-like, or callable default None - Return a subset of the columns. - - * If None, then parse all columns. - * If int, then indicates last column to be parsed. - - .. deprecated:: 0.24.0 - Pass in a list of int instead from 0 to `usecols` inclusive. - - * If str, then indicates comma separated list of Excel column letters - and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. - * If list of int, then indicates list of column numbers to be parsed. - * If list of string, then indicates list of column names to be parsed. - - .. versionadded:: 0.24.0 - - * If callable, then evaluate each column name against it and parse the - column if the callable returns ``True``. - - .. versionadded:: 0.24.0 - - squeeze : bool, default False - If the parsed data only contains one column then return a Series. 
- dtype : Type name or dict of column -> type, default None - Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - Use `object` to preserve data as stored in Excel and not interpret dtype. - If converters are specified, they will be applied INSTEAD - of dtype conversion. - engine : str, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None, "xlrd", "openpyxl" or "odf". - converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. - true_values : list, default None - Values to consider as True. - false_values : list, default None - Values to consider as False. - skiprows : list-like - Rows to skip at the beginning (0-indexed). - nrows : int, default None - Number of rows to parse. - - .. versionadded:: 0.23.0 - - na_values : scalar, str, list-like, or dict, default None - Additional strings to recognize as NA/NaN. If dict passed, specific - per-column NA values. By default the following values are interpreted - as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, - subsequent_indent=" ") + """'. - keep_default_na : bool, default True - Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: - - * If `keep_default_na` is True, and `na_values` are specified, `na_values` - is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only - the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is False, and `na_values` are not specified, no - strings will be parsed as NaN. - - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. - na_filter : bool, default True - Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. - verbose : bool, default False - Indicate number of NA values placed in non-numeric columns. - parse_dates : bool, list-like, or dict, default False - The behavior is as follows: - - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 - each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as - a single date column. - * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call - result 'foo' - - If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. - - Note: A fast-path exists for iso8601-formatted dates. - date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. 
Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. - comment : str, default None - Comments out remainder of line. Pass a character or characters to this - argument to indicate comments in the input file. Any data between the - comment string and the end of the current line is ignored. - skip_footer : int, default 0 - Alias of `skipfooter`. - - .. deprecated:: 0.23.0 - Use `skipfooter` instead. - skipfooter : int, default 0 - Rows at the end to skip (0-indexed). - convert_float : bool, default True - Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally. - mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. - **kwds : optional +Read an Excel file into a pandas DataFrame. + +Support both `xls` and `xlsx` file extensions from a local filesystem or URL. +Support an option to read a single sheet or a list of sheets. + +Parameters +---------- +io : str, ExcelFile, xlrd.Book, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.xlsx``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. +sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions. Lists of strings/integers are used to request + multiple sheets. Specify None to get all sheets. + + Available cases: + + * Defaults to ``0``: 1st sheet as a `DataFrame` + * ``1``: 2nd sheet as a `DataFrame` + * ``"Sheet1"``: Load sheet with name "Sheet1" + * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" + as a dict of `DataFrame` + * None: All sheets. + +header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. +names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. +index_col : int, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. 
+usecols : int, str, list-like, or callable default None + Behavior: + + * If None, then parse all columns. + * If int, then indicates last column to be parsed. + + .. deprecated:: 0.24.0 + Pass in a list of int instead from 0 to `usecols` inclusive. + + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed. + * If list of string, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + + .. versionadded:: 0.24.0 + +squeeze : bool, default False + If the parsed data only contains one column then return a Series. +dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use `object` to preserve data as stored in Excel and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. +engine : str, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None, "xlrd", "openpyxl" or "odf". +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. +true_values : list, default None + Values to consider as True. +false_values : list, default None + Values to consider as False. +skiprows : list-like + Rows to skip at the beginning (0-indexed). +nrows : int, default None + Number of rows to parse. + + .. versionadded:: 0.23.0 + +na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + """'. +keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. +verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. +parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * bool. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. 
If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. For non-standard + datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. +date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. +thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. +comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. +skip_footer : int, default 0 + Alias of `skipfooter`. + + .. deprecated:: 0.23.0 + Use `skipfooter` instead. +skipfooter : int, default 0 + Rows at the end to skip (0-indexed). +convert_float : bool, default True + Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally. +mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. +**kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. - Returns - ------- - DataFrame or dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheet_name - argument for more information on when a dict of DataFrames is returned. - - See Also - -------- - to_excel : Write DataFrame to an Excel file. - to_csv : Write DataFrame to a comma-separated values (csv) file. - read_csv : Read a comma-separated values (csv) file into DataFrame. - read_fwf : Read a table of fixed-width formatted lines into DataFrame. - - Examples - -------- - The file can be read using the file name as string or an open file object: - - >>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP - Name Value - 0 string1 1 - 1 string2 2 - 2 #Comment 3 - - >>> pd.read_excel(open('tmp.xlsx', 'rb'), - ... sheet_name='Sheet3') # doctest: +SKIP - Unnamed: 0 Name Value - 0 0 string1 1 - 1 1 string2 2 - 2 2 #Comment 3 - - Index and header can be specified via the `index_col` and `header` arguments - - >>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP - 0 1 2 - 0 NaN Name Value - 1 0.0 string1 1 - 2 1.0 string2 2 - 3 2.0 #Comment 3 - - Column types are inferred but can be explicitly specified - - >>> pd.read_excel('tmp.xlsx', index_col=0, - ... 
-    ...               dtype={'Name': str, 'Value': float})  # doctest: +SKIP
-           Name  Value
-    0   string1    1.0
-    1   string2    2.0
-    2  #Comment    3.0
-
-    True, False, and NA values, and thousands separators have defaults,
-    but can be explicitly specified, too. Supply the values you would like
-    as strings or lists of strings!
-
-    >>> pd.read_excel('tmp.xlsx', index_col=0,
-    ...               na_values=['string1', 'string2'])  # doctest: +SKIP
-           Name  Value
-    0       NaN      1
-    1       NaN      2
-    2  #Comment      3
-
-    Comment lines in the excel input file can be skipped using the `comment` kwarg
-
-    >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#')  # doctest: +SKIP
-          Name  Value
-    0  string1    1.0
-    1  string2    2.0
-    2     None    NaN
-    """
+Returns
+-------
+DataFrame or dict of DataFrames
+    DataFrame from the passed in Excel file. See notes in sheet_name
+    argument for more information on when a dict of DataFrames is returned.
+
+See Also
+--------
+to_excel : Write DataFrame to an Excel file.
+to_csv : Write DataFrame to a comma-separated values (csv) file.
+read_csv : Read a comma-separated values (csv) file into DataFrame.
+read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+Examples
+--------
+The file can be read using the file name as string or an open file object:
+
+>>> pd.read_excel('tmp.xlsx', index_col=0)  # doctest: +SKIP
+       Name  Value
+0   string1      1
+1   string2      2
+2  #Comment      3
+
+>>> pd.read_excel(open('tmp.xlsx', 'rb'),
+...               sheet_name='Sheet3')  # doctest: +SKIP
+   Unnamed: 0      Name  Value
+0           0   string1      1
+1           1   string2      2
+2           2  #Comment      3
+
+Index and header can be specified via the `index_col` and `header` arguments
+
+>>> pd.read_excel('tmp.xlsx', index_col=None, header=None)  # doctest: +SKIP
+     0         1      2
+0  NaN      Name  Value
+1  0.0   string1      1
+2  1.0   string2      2
+3  2.0  #Comment      3
+
+Column types are inferred but can be explicitly specified
+
+>>> pd.read_excel('tmp.xlsx', index_col=0,
+...               dtype={'Name': str, 'Value': float})  # doctest: +SKIP
+       Name  Value
+0   string1    1.0
+1   string2    2.0
+2  #Comment    3.0
+
+True, False, and NA values, and thousands separators have defaults,
+but can be explicitly specified, too. Supply the values you would like
+as strings or lists of strings!
+
+>>> pd.read_excel('tmp.xlsx', index_col=0,
+...               na_values=['string1', 'string2'])  # doctest: +SKIP
+       Name  Value
+0       NaN      1
+1       NaN      2
+2  #Comment      3
+
+Comment lines in the Excel input file can be skipped using the `comment` kwarg.
+
+>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#')  # doctest: +SKIP
+      Name  Value
+0  string1    1.0
+1  string2    2.0
+2     None    NaN
+"""
 )


@@ -540,10 +542,8 @@ def parse(

 class ExcelWriter(metaclass=abc.ABCMeta):
     """
-    Class for writing DataFrame objects into excel sheets.
-
-    Default is to use xlwt for xls, openpyxl for xlsx.
-    See DataFrame.to_excel for typical usage.
+    Class for writing DataFrame objects into Excel sheets; default is to use
+    xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage.

     Parameters
     ----------
@@ -556,8 +556,8 @@ class ExcelWriter(metaclass=abc.ABCMeta):
     date_format : str, default None
         Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
     datetime_format : str, default None
-        Format string for datetime objects written into Excel files
-        (e.g. 'YYYY-MM-DD HH:MM:SS').
+        Format string for datetime objects written into Excel files,
+        e.g. 'YYYY-MM-DD HH:MM:SS'.
     mode : {'w', 'a'}, default 'w'
         File mode to use (write or append).
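The `datetime_format` and `mode` parameters documented above combine as in the
following sketch. This is an illustrative snippet, not part of the patch; the
file name 'report.xlsx', the sheet names, and the frame contents are made up
for the example:

    import pandas as pd

    df = pd.DataFrame({"when": pd.to_datetime(["2019-11-07 09:55:33"]),
                       "value": [1.5]})

    # mode='w' (the default) creates or overwrites the workbook;
    # datetime_format controls how datetime cells are rendered.
    with pd.ExcelWriter("report.xlsx",
                        datetime_format="YYYY-MM-DD HH:MM:SS") as writer:
        df.to_excel(writer, sheet_name="Sheet1")

    # mode='a' appends a sheet to an existing workbook; appending
    # requires the openpyxl engine.
    with pd.ExcelWriter("report.xlsx", engine="openpyxl", mode="a") as writer:
        df.to_excel(writer, sheet_name="Sheet2")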
From 378c6bb8b35ba5ed30c71785b765b41b78d14ee1 Mon Sep 17 00:00:00 2001
From: hughkelley
Date: Fri, 8 Nov 2019 08:43:23 -0500
Subject: [PATCH 16/17] typo and remove text before list.

---
 pandas/core/reshape/merge.py   | 3 +--
 pandas/core/tools/datetimes.py | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 039dd99976cc2..956642b51ce97 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -212,7 +212,6 @@ def merge_ordered(
         .. versionchanged:: 0.25.0

     how : {'left', 'right', 'outer', 'inner'}, default 'outer'
-        Behavior:
         * left: use only keys from left frame (SQL: left outer join)
         * right: use only keys from right frame (SQL: right outer join)
         * outer: use union of keys from both frames (SQL: full outer join)
         * inner: use intersection of keys from both frames (SQL: inner join)

     Returns
     -------
     DataFrame
-        The merged DataFrame The output type will the be same as
+        The merged DataFrame output type will be the same as
         'left', if it is a subclass of DataFrame.

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 5b4fcff4c3653..ffe0ba3315dd7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -604,7 +604,6 @@ def to_datetime(
         Return UTC DatetimeIndex if True (converting any tz-aware
         datetime.datetime objects as well).
     box : bool, default True
-        Behaves as:

        - If True returns a DatetimeIndex or Index-like object
        - If False returns ndarray of values.

From fa8bb163426cf876d4ba302a70d47b0ba65fedc5 Mon Sep 17 00:00:00 2001
From: hughkelley
Date: Fri, 8 Nov 2019 09:40:31 -0500
Subject: [PATCH 17/17] remove "Behaves"

---
 pandas/core/tools/datetimes.py | 1 -
 pandas/io/excel/_base.py       | 2 --
 2 files changed, 3 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ffe0ba3315dd7..bb8d15896b727 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -580,7 +580,6 @@ def to_datetime(
     arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
         The object to convert to a datetime.
     errors : {'ignore', 'raise', 'coerce'}, default 'raise'
-        Behaves as:
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'ignore', then invalid parsing will return the input.

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 8e94d4699dc9b..d0ab6dd37596c 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -79,8 +79,6 @@
     subset of data is selected with ``usecols``, index_col
     is based on the subset.
 usecols : int, str, list-like, or callable, default None
-    Behavior:
-
     * If None, then parse all columns.
     * If int, then indicates last column to be parsed.
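As a quick check of the `usecols` variants enumerated in the hunk above, each
accepted form can be exercised as below. A minimal sketch, assuming a
hypothetical 'tmp.xlsx' whose header row contains 'Name' and 'Value' columns:

    import pandas as pd

    pd.read_excel("tmp.xlsx")                             # None: all columns
    pd.read_excel("tmp.xlsx", usecols="A:B")              # Excel letter range
    pd.read_excel("tmp.xlsx", usecols=[0, 1])             # list of positions
    pd.read_excel("tmp.xlsx", usecols=["Name", "Value"])  # list of labels
    pd.read_excel("tmp.xlsx", usecols=lambda c: c == "Name")  # callable filter

The single-int form (e.g. ``usecols=1``) still works but is deprecated as of
0.24.0; per the docstring, a list of int from 0 to that column inclusive is
the documented replacement.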